Added timeout (timeout=5) to both session.get calls
This commit is contained in:
parent
a09bc1da54
commit
95e4b27bc3
5
main.py
5
main.py
|
@@ -28,7 +28,7 @@ while not urls.empty():
|
|||
# Perform request
|
||||
log.info("Fetching '%s'..."%url)
|
||||
try:
|
||||
r = session.get(url, stream=True)
|
||||
r = session.get(url, stream=True, timeout=5)
|
||||
|
||||
# Check file size
|
||||
if ("Content-Length" in r.headers.keys() and int(r.headers["Content-Length"]) > MAX_CONTENT_LENGTH) or ("content-length" in r.headers.keys() and int(r.headers["content-length"]) > MAX_CONTENT_LENGTH):
|
||||
|
@@ -36,7 +36,7 @@ while not urls.empty():
|
|||
continue
|
||||
|
||||
# Download full
|
||||
r = session.get(url)
|
||||
r = session.get(url, timeout=5)
|
||||
|
||||
except Exception as e:
|
||||
log.info("failed")
|
||||
|
@@ -55,7 +55,6 @@ while not urls.empty():
|
|||
hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
|
||||
pages = [(url[:url.rfind("/")+1]+res[0][(1 if res[0][0] == "/" else 0):]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
|
||||
hrefs.extend(pages)
|
||||
print(hrefs)
|
||||
except Exception as e:
|
||||
log.info("failed to parse page")
|
||||
log.info(e)
|
||||
|
|
Loading…
Reference in New Issue