From 95e4b27bc3b66fdbf6a4b32ca20f1b41f8f8e6e2 Mon Sep 17 00:00:00 2001 From: bitscuit Date: Sat, 8 May 2021 21:07:00 +0200 Subject: [PATCH] Added timeout --- main.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index f606d28..39d39ae 100644 --- a/main.py +++ b/main.py @@ -28,7 +28,7 @@ while not urls.empty(): # Perform request log.info("Fetching '%s'..."%url) try: - r = session.get(url, stream=True) + r = session.get(url, stream=True, timeout=5) # Check file size if ("Content-Length" in r.headers.keys() and int(r.headers["Content-Length"]) > MAX_CONTENT_LENGTH) or ("content-length" in r.headers.keys() and int(r.headers["content-length"]) > MAX_CONTENT_LENGTH): @@ -36,7 +36,7 @@ while not urls.empty(): continue # Download full - r = session.get(url) + r = session.get(url, timeout=5) except Exception as e: log.info("failed") @@ -55,7 +55,6 @@ while not urls.empty(): hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)] pages = [(url[:url.rfind("/")+1]+res[0][(1 if res[0][0] == "/" else 0):]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)] hrefs.extend(pages) - print(hrefs) except Exception as e: log.info("failed to parse page") log.info(e)