From 52b3612a8d870c7b2b24124c27c0cab0bab432df Mon Sep 17 00:00:00 2001
From: bitscuit
Date: Sat, 8 May 2021 21:15:33 +0200
Subject: [PATCH] Fixed some small issues

---
 main.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/main.py b/main.py
index 39d39ae..ec6db36 100644
--- a/main.py
+++ b/main.py
@@ -29,6 +29,11 @@ while not urls.empty():
     log.info("Fetching '%s'..."%url)
     try:
         r = session.get(url, stream=True, timeout=5)
+
+        # Read response
+        if r.status_code != 200:
+            log.info("returned status %d"%r.status_code)
+            continue
 
         # Check file size
         if ("Content-Length" in r.headers.keys() and int(r.headers["Content-Length"]) > MAX_CONTENT_LENGTH) or ("content-length" in r.headers.keys() and int(r.headers["content-length"]) > MAX_CONTENT_LENGTH):
@@ -44,16 +49,11 @@ while not urls.empty():
     finally:
         r.close()
 
-    # Read response
-    if r.status_code != 200:
-        log.info("returned status %d"%r.status_code)
-        continue
-
     # Filter page for hrefs
     hrefs = []
     try:
         hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
-        pages = [(url[:url.rfind("/")+1]+res[0][(1 if res[0][0] == "/" else 0):]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
+        pages = [(url[:(url.rfind("/")+1 if res[0][0] != "/" else url.find("/", 10))]+res[0]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
         hrefs.extend(pages)
     except Exception as e:
         log.info("failed to parse page")