Fixed some small issues

This commit is contained in:
bitscuit 2021-05-08 21:15:33 +02:00
parent 95e4b27bc3
commit 52b3612a8d
1 changed file with 6 additions and 6 deletions

12
main.py
View File

@@ -30,6 +30,11 @@ while not urls.empty():
try: try:
r = session.get(url, stream=True, timeout=5) r = session.get(url, stream=True, timeout=5)
# Read response
if r.status_code != 200:
log.info("returned status %d"%r.status_code)
continue
# Check file size # Check file size
if ("Content-Length" in r.headers.keys() and int(r.headers["Content-Length"]) > MAX_CONTENT_LENGTH) or ("content-length" in r.headers.keys() and int(r.headers["content-length"]) > MAX_CONTENT_LENGTH): if ("Content-Length" in r.headers.keys() and int(r.headers["Content-Length"]) > MAX_CONTENT_LENGTH) or ("content-length" in r.headers.keys() and int(r.headers["content-length"]) > MAX_CONTENT_LENGTH):
log.info("too large response") log.info("too large response")
@@ -44,16 +49,11 @@ while not urls.empty():
finally: finally:
r.close() r.close()
# Read response
if r.status_code != 200:
log.info("returned status %d"%r.status_code)
continue
# Filter page for hrefs # Filter page for hrefs
hrefs = [] hrefs = []
try: try:
hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)] hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
pages = [(url[:url.rfind("/")+1]+res[0][(1 if res[0][0] == "/" else 0):]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)] pages = [(url[:(url.rfind("/")+1 if res[0][0] != "/" else url.find("/", 10))]+res[0]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
hrefs.extend(pages) hrefs.extend(pages)
except Exception as e: except Exception as e:
log.info("failed to parse page") log.info("failed to parse page")