# vnp/sites/FreesmscenterCom.py
#
# 217 lines
# 6.3 KiB
# Python

"""Module for parsing freesmscenter.com.
NOTICE: this could be rewritten in a cleaner manner in the future.
"""
import re
import WebSiteParser
class FreesmscenterCom(WebSiteParser.WebSiteParser):
    """Parser for freesmscenter.com.

    ``Parse`` reads the country links from the main page, then the
    per-number links from every country page, parses each number page with
    ``ParseNumberPage`` and finally passes the collected records to
    ``ProcessRawNumbers``, which sorts them into ``self.Output``.
    """

    # Patterns are raw strings so that the backslash in front of "+" is a
    # regex escape and not an (invalid) string escape sequence.
    _NUMBER_HEADER_RE = re.compile(r'<h1 id="numberget" class="cw">\+[0-9]+</h1>')
    _NUMBER_RE = re.compile(r"\+[0-9]+")
    _ALNUM_RE = re.compile(r"[a-zA-Z0-9]")
    _TABLE_CELL_RE = re.compile(r'<td.*">.*</td>')
    _CELL_FROM_RE = re.compile(r"<td.*datesize3.*>.*</td>")
    _CELL_CONTENT_RE = re.compile(r"<td.*datesize5.*>.*</td>")
    _CELL_TIME_RE = re.compile(r"<td.*datesize4.*>.*</td>")
    _CELL_WRAPPER_RE = re.compile(r'(<td.*">)|(</td>)')

    def __init__(self):
        super().__init__("freesmscenter.com")

    def Parse(self):
        """Perform parsing of the entire web site.

        Fills ``self.Output`` through ``ProcessRawNumbers``, sets
        ``self.ParseDone`` to ``True`` and logs completion.
        """
        # Main page: one "Show Phone Numbers" link per country.
        country_uris = self.ParseURIsFromPage(
            r'<a href="/FreeSms/.+-Phone-Number" class="btn btn-primary">Show Phone Numbers</a>',
            '<a href="/',
            '" class="btn btn-primary">Show Phone Numbers</a>',
        )

        # Country pages: one "Receive SMS" link per phone number.
        numbers_raw = []
        for country_uri in country_uris:
            number_uris = []
            # The same page is requested a second time when the first
            # attempt yields no links.
            # NOTE(review): the original also computed
            # country_uri.replace("FreeSms", "SmsReceive") but never used
            # it; confirm whether the retry was meant to target that URI.
            for _attempt in range(2):
                number_uris = self.ParseURIsFromPage(
                    r'<a href="/SmsReceive/.+/[0-9]+" class="btn btn-primary">Receive SMS</a>',
                    '<a href="/',
                    '" class="btn btn-primary">Receive SMS</a>',
                    country_uri,
                )
                if number_uris:
                    break

            country = (
                country_uri
                .replace("FreeSms/", "")
                .replace("-Phone-Number", "")
                .replace("%20", " ")
            )
            numbers_raw.extend(
                {"uri_raw": number_uri, "country": country}
                for number_uri in number_uris
            )

        # Number pages: merge the parsed fields into every raw record.
        for raw in numbers_raw:
            raw.update(self.ParseNumberPage(raw["uri_raw"]))

        self.ProcessRawNumbers(numbers_raw)
        self.ParseDone = True
        self.Log("parsing done")

    def ParseURIsFromPage(self, exp, r1, r2, uri=""):
        """Parse all URIs from the selected page.

        Args:
            exp: regular expression matching a whole link element.
            r1: text to remove in front of the URI in every match.
            r2: text to remove behind the URI in every match.
            uri: page to request; passed to ``self.RequestPage`` as is.

        Returns:
            A list with one stripped URI per match of ``exp``.
        """
        markup = self.RequestPage(uri).text
        return [
            piece.replace(r1, "").replace(r2, "")
            for piece in re.findall(exp, markup)
        ]

    def ParseNumberPage(self, uri):
        """Parse the page with the message history of a single number.

        NOTICE: this could be better, but a lot of hardcoded workarounds
        would be required, because the original site has a lot of bugs.

        Args:
            uri: URI of the number page.

        Returns:
            A dict with the keys ``number``, ``last_use_raw``, ``born_raw``
            and ``times_used``. Fields that could not be parsed keep the
            value ``self.EMPTY``; every "error" exit also increments
            ``self.ErrorsWhileParsing``.
        """
        def die(text):
            self.Log(text, f'ParseNumberPage(self at {id(self)}, "{uri}")')

        result = {
            "number": self.EMPTY,
            "last_use_raw": self.EMPTY,
            "born_raw": self.EMPTY,
            "times_used": self.EMPTY,
        }

        markup = self.RequestPage(uri).text

        header = self._NUMBER_HEADER_RE.search(markup)
        if not header:
            die("error: can't parse page, number is empty")
            self.ErrorsWhileParsing += 1
            return result
        result["number"] = self._NUMBER_RE.search(header.group()).group()

        cells = self._TABLE_CELL_RE.findall(markup)
        if not cells:
            die("error: can't parse page, tableCellsRaw is empty")
            self.ErrorsWhileParsing += 1
            return result
        if len(cells) % 3:
            die("warning: length tableCellsRaw should be divisible by 3, incorrect results are expected")

        rows = self._GroupCells(cells)
        if not rows:
            die("error: can't parse page, tableLines is empty")
            self.ErrorsWhileParsing += 1
            return result

        # The first row that has a time gives the last use, the last such
        # row gives the "born" time.
        times = [row[2] for row in rows if row[2] != self.EMPTY]
        if times:
            result["last_use_raw"] = self._CELL_WRAPPER_RE.sub("", times[0])
            result["born_raw"] = self._CELL_WRAPPER_RE.sub("", times[-1])

        result["times_used"] = len(rows)
        return result

    def _GroupCells(self, cells):
        """Group raw ``<td>`` cells into ``[from, content, time]`` rows.

        A well-formed run of three cells (from, content, time) is taken as
        one row. Cells outside such a run are collected one by one into a
        pending row whose missing fields are filled with ``self.EMPTY``.

        Args:
            cells: list of raw cell strings in page order.

        Returns:
            A list of three-element lists.
        """
        def fill_gaps(line):
            # Replace every still-missing field with the EMPTY marker.
            for k, value in enumerate(line):
                if not value:
                    line[k] = self.EMPTY
            return line

        rows = []
        pending = [None, None, None]
        total = len(cells)
        i = 0
        # An explicit index is required here: the fast path has to skip
        # the two cells it has already consumed, which is impossible by
        # assigning to the variable of a ``for ... in range()`` loop.
        while i < total:
            cell = cells[i]
            is_from = self._CELL_FROM_RE.search(cell)

            # Fast path: a complete row starts at this cell. The bound
            # ``i + 2 < total`` includes the final row of the table.
            if (
                is_from
                and i + 2 < total
                and self._CELL_CONTENT_RE.search(cells[i + 1])
                and self._CELL_TIME_RE.search(cells[i + 2])
            ):
                rows.append([cell, cells[i + 1], cells[i + 2]])
                pending = [None, None, None]
                i += 3
                continue

            # Slow path: assemble a row from cells that are out of order
            # or incomplete.
            # NOTE(review): the alphanumeric test below runs on the raw
            # cell including its tag, which always contains letters, so
            # the EMPTY branch is not taken for matched cells.
            if is_from and not pending[0]:
                pending[0] = cell if self._ALNUM_RE.search(cell) else self.EMPTY
            elif pending[0]:
                fill_gaps(pending)

            if self._CELL_CONTENT_RE.search(cell) and not pending[1]:
                pending[1] = cell
            elif pending[1]:
                fill_gaps(pending)

            if self._CELL_TIME_RE.search(cell) and not pending[2]:
                pending[2] = cell if self._ALNUM_RE.search(cell) else self.EMPTY
            elif pending[2]:
                fill_gaps(pending)

            if pending[0] and pending[1] and pending[2]:
                rows.append(pending)
                pending = [None, None, None]
            i += 1

        return rows

    def ProcessRawNumbers(self, raws):
        """Validate raw number records and sort them into ``self.Output``.

        Records without a valid number are logged and skipped. Malformed
        ``last_use_raw`` / ``born_raw`` values are logged and replaced by
        ``self.EMPTY``. Every remaining record gets the keys ``last_use``,
        ``born`` and ``url`` and is appended to ``not_used``,
        ``almost_not_used`` or ``heavily_used`` depending on
        ``times_used``.

        Args:
            raws: list of dicts as built by ``Parse``; modified in place.
        """
        where = f"ProcessRawNumbers(self at {id(self)}, raws at {id(raws)})"

        for raw in raws:
            number = raw["number"]
            if not self._NUMBER_RE.search(number):
                self.Log(f'error: incorrect number: "{number}"', where)
                continue

            for field, target in (("last_use_raw", "last_use"), ("born_raw", "born")):
                value = raw[field]
                if not re.search(r"[a-zA-Z0-9]+", value):
                    self.Log(f'warning: malformed {field} field: "{value}"', where)
                    raw[field] = self.EMPTY
                raw[target] = raw[field]

            raw["url"] = self.WebSiteFullURL + raw["uri_raw"]

            # ParseNumberPage leaves times_used at self.EMPTY when the
            # message table could not be parsed; ordering that marker
            # against the numeric threshold would fail, so such records
            # are reported and left out of the buckets.
            times_used = raw["times_used"]
            if not isinstance(times_used, int):
                self.Log(f'error: times_used is not a number: "{times_used}"', where)
                continue

            if times_used == 0:
                self.Output["not_used"].append(raw)
            elif times_used < self.AlmostNotUsedThreshold:
                self.Output["almost_not_used"].append(raw)
            else:
                self.Output["heavily_used"].append(raw)