Also parse non-index urls
This commit is contained in:
parent
0462614a22
commit
9fa95aba03
8
main.py
8
main.py
|
@ -40,7 +40,15 @@ while not urls.empty():
|
|||
continue
|
||||
|
||||
# Filter page for hrefs.
# Two kinds of link are collected from the response body:
#   1. absolute http(s) URLs, cut off at the first "/" after the host name;
#   2. relative href="..." targets, appended to the directory part of `url`.
hrefs = []
try:
    body = r.text

    # Raw strings keep the regex escapes (e.g. "\.") out of Python's own
    # string-escape processing. "https?" accepts exactly "http" or "https".
    hrefs = re.findall(
        r"https?://(?:[a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}/", body
    )

    # The path part is a flat character class followed by optional
    # ".suffix" pieces. Avoiding a quantified group of optional pieces
    # keeps matching linear on hrefs that end up rejected (query strings,
    # uppercase letters, ...), which matters because the HTML is untrusted.
    targets = re.findall(
        r"href *= *[\"']([a-z0-9/]*(?:\.[a-z0-9]*)*)[\"']", body
    )

    # Everything up to and including the last "/" of the current URL.
    # NOTE(review): for a URL without a path (e.g. "http://host") this is
    # just the scheme prefix -- confirm queued URLs always end in "/" or
    # carry a path.
    base = url[:url.rfind("/") + 1]

    # href="" also matches the pattern; skip it instead of indexing into an
    # empty string, which used to abort parsing of the whole page.
    # A single leading "/" is dropped so the join never doubles the slash.
    pages = [
        base + (target[1:] if target.startswith("/") else target)
        for target in targets
        if target
    ]
    hrefs.extend(pages)
    print(hrefs)
except Exception as e:
    # Best effort: a page that cannot be parsed is logged and contributes
    # only the links gathered before the failure.
    log.info("failed to parse page")
    log.info(e)
|
||||
|
||||
# Add to queue
|
||||
numAdded = 0
|
||||
|
|
Loading…
Reference in New Issue