From 9fa95aba037bf7fd64d86440b4545ee1a03a960c Mon Sep 17 00:00:00 2001
From: bitscuit <thomas@bitscuit.be>
Date: Sat, 8 May 2021 20:31:44 +0200
Subject: [PATCH] Also parse non-index urls

---
 main.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 6357c9b..0055ecc 100644
--- a/main.py
+++ b/main.py
@@ -40,7 +40,15 @@ while not urls.empty():
         continue
 
     # Filter page for hrefs
-    hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
+    hrefs = []
+    try:
+        hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
+        pages = [(url[:url.rfind("/")+1]+res[0][(1 if res[0][0] == "/" else 0):]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
+        hrefs.extend(pages)
+        print(hrefs)
+    except Exception as e:
+        log.info("failed to parse page")
+        log.info(e)
 
     # Add to queue
     numAdded = 0