Also parse non-index urls

This commit is contained in:
bitscuit 2021-05-08 20:31:44 +02:00
parent 0462614a22
commit 9fa95aba03
1 changed files with 9 additions and 1 deletions

View File

@ -40,7 +40,15 @@ while not urls.empty():
continue
# Filter page for hrefs
hrefs = []
try:
hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
pages = [(url[:url.rfind("/")+1]+res[0][(1 if res[0][0] == "/" else 0):]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
hrefs.extend(pages)
print(hrefs)
except Exception as e:
log.info("failed to parse page")
log.info(e)
# Add to queue
numAdded = 0