Fixed some small issues

2021-05-08 21:15:33 +02:00 · 2021-05-08 21:15:33 +02:00 · 52b3612a8d
commit 52b3612a8d
parent 95e4b27bc3
1 changed file with 6 additions and 6 deletions
--- a/main.py
+++ b/main.py
@ -30,6 +30,11 @@ while not urls.empty():
    try:
        r = session.get(url, stream=True, timeout=5)

+        # Read response
+        if r.status_code != 200:
+            log.info("returned status %d"%r.status_code)
+            continue
+        
        # Check file size
        if ("Content-Length" in r.headers.keys() and int(r.headers["Content-Length"]) > MAX_CONTENT_LENGTH) or ("content-length" in r.headers.keys() and int(r.headers["content-length"]) > MAX_CONTENT_LENGTH):
            log.info("too large response")
@ -44,16 +49,11 @@ while not urls.empty():
    finally:
        r.close()

-    # Read response
-    if r.status_code != 200:
-        log.info("returned status %d"%r.status_code)
-        continue
-
    # Filter page for hrefs
    hrefs = []
    try:
        hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
-        pages = [(url[:url.rfind("/")+1]+res[0][(1 if res[0][0] == "/" else 0):]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
+        pages = [(url[:(url.rfind("/")+1 if res[0][0] != "/" else url.find("/", 10))]+res[0]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
        hrefs.extend(pages)
    except Exception as e:
        log.info("failed to parse page")