Also parse non-index urls

2021-05-08 20:31:44 +02:00 · 2021-05-08 20:31:44 +02:00 · 9fa95aba03
commit 9fa95aba03
parent 0462614a22
1 changed files with 9 additions and 1 deletions
--- a/main.py
+++ b/main.py
@@ -40,7 +40,15 @@ while not urls.empty():
        continue
    # Filter page for hrefs
-    hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
+    hrefs = []
+    try:
+        hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
+        pages = [(url[:url.rfind("/")+1]+res[0][(1 if res[0][0] == "/" else 0):]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
+        hrefs.extend(pages)
+        print(hrefs)
+    except Exception as e:
+        log.info("failed to parse page")
+        log.info(e)
    # Add to queue
    numAdded = 0