Fixed some small issues
This commit is contained in:
parent
95e4b27bc3
commit
52b3612a8d
12
main.py
12
main.py
|
@ -29,6 +29,11 @@ while not urls.empty():
|
||||||
log.info("Fetching '%s'..."%url)
|
log.info("Fetching '%s'..."%url)
|
||||||
try:
|
try:
|
||||||
r = session.get(url, stream=True, timeout=5)
|
r = session.get(url, stream=True, timeout=5)
|
||||||
|
|
||||||
|
# Read response
|
||||||
|
if r.status_code != 200:
|
||||||
|
log.info("returned status %d"%r.status_code)
|
||||||
|
continue
|
||||||
|
|
||||||
# Check file size
|
# Check file size
|
||||||
if ("Content-Length" in r.headers.keys() and int(r.headers["Content-Length"]) > MAX_CONTENT_LENGTH) or ("content-length" in r.headers.keys() and int(r.headers["content-length"]) > MAX_CONTENT_LENGTH):
|
if ("Content-Length" in r.headers.keys() and int(r.headers["Content-Length"]) > MAX_CONTENT_LENGTH) or ("content-length" in r.headers.keys() and int(r.headers["content-length"]) > MAX_CONTENT_LENGTH):
|
||||||
|
@ -44,16 +49,11 @@ while not urls.empty():
|
||||||
finally:
|
finally:
|
||||||
r.close()
|
r.close()
|
||||||
|
|
||||||
# Read response
|
|
||||||
if r.status_code != 200:
|
|
||||||
log.info("returned status %d"%r.status_code)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Filter page for hrefs
|
# Filter page for hrefs
|
||||||
hrefs = []
|
hrefs = []
|
||||||
try:
|
try:
|
||||||
hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
|
hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
|
||||||
pages = [(url[:url.rfind("/")+1]+res[0][(1 if res[0][0] == "/" else 0):]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
|
pages = [(url[:(url.rfind("/")+1 if res[0][0] != "/" else url.find("/", 10))]+res[0]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
|
||||||
hrefs.extend(pages)
|
hrefs.extend(pages)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.info("failed to parse page")
|
log.info("failed to parse page")
|
||||||
|
|
Loading…
Reference in New Issue