Also parse non-index urls
This commit is contained in:
parent
0462614a22
commit
9fa95aba03
10
main.py
10
main.py
|
@@ -40,7 +40,15 @@ while not urls.empty():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Filter page for hrefs
|
# Filter page for hrefs
|
||||||
hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
|
hrefs = []
|
||||||
|
try:
|
||||||
|
hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
|
||||||
|
pages = [(url[:url.rfind("/")+1]+res[0][(1 if res[0][0] == "/" else 0):]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
|
||||||
|
hrefs.extend(pages)
|
||||||
|
print(hrefs)
|
||||||
|
except Exception as e:
|
||||||
|
log.info("failed to parse page")
|
||||||
|
log.info(e)
|
||||||
|
|
||||||
# Add to queue
|
# Add to queue
|
||||||
numAdded = 0
|
numAdded = 0
|
||||||
|
|
Loading…
Reference in New Issue