From 52b3612a8d870c7b2b24124c27c0cab0bab432df Mon Sep 17 00:00:00 2001
From: bitscuit
Date: Sat, 8 May 2021 21:15:33 +0200
Subject: [PATCH] Fixed some small issues

---
 main.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/main.py b/main.py
index 39d39ae..ec6db36 100644
--- a/main.py
+++ b/main.py
@@ -29,6 +29,11 @@ while not urls.empty():
     log.info("Fetching '%s'..."%url)
     try:
         r = session.get(url, stream=True, timeout=5)
+
+        # Read response
+        if r.status_code != 200:
+            log.info("returned status %d"%r.status_code)
+            continue
 
         # Check file size
         if ("Content-Length" in r.headers.keys() and int(r.headers["Content-Length"]) > MAX_CONTENT_LENGTH) or ("content-length" in r.headers.keys() and int(r.headers["content-length"]) > MAX_CONTENT_LENGTH):
@@ -44,16 +49,11 @@ while not urls.empty():
     finally:
         r.close()
 
-    # Read response
-    if r.status_code != 200:
-        log.info("returned status %d"%r.status_code)
-        continue
-
     # Filter page for hrefs
     hrefs = []
     try:
         hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
-        pages = [(url[:url.rfind("/")+1]+res[0][(1 if res[0][0] == "/" else 0):]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
+        pages = [(url[:(url.rfind("/")+1 if res[0][0] != "/" else url.find("/", 10))]+res[0]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
         hrefs.extend(pages)
     except Exception as e:
         log.info("failed to parse page")