diff --git a/main.py b/main.py
index 6357c9b..0055ecc 100644
--- a/main.py
+++ b/main.py
@@ -40,7 +40,28 @@ while not urls.empty():
         continue
 
     # Filter page for hrefs
-    hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
+    # Collect outgoing links: absolute site URLs found anywhere in the page
+    # text, plus relative href="..." targets resolved against the page URL.
+    # Local import: the module's import block is outside this hunk.
+    from urllib.parse import urljoin
+
+    hrefs = []
+    try:
+        # Raw strings keep the regex escapes out of Python's own string-escape
+        # processing; "https?" accepts exactly http:// and https://.
+        hrefs = [res[0] for res in re.findall(r"(https?://([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}/)", r.text)]
+        # Linear-time equivalent of the nested "([a-z0-9]*/*)+" form, which
+        # backtracks exponentially on a long href that ends in a character
+        # the pattern cannot match (e.g. "?" or "#").
+        paths = re.findall(r"href *= *[\"']([a-z0-9/]*(?:\.[a-z0-9]*)*)[\"']", r.text)
+        # urljoin handles root-relative ("/x") and document-relative ("x")
+        # targets as well as base URLs without a trailing slash; empty hrefs
+        # are skipped instead of raising IndexError.
+        hrefs.extend(urljoin(url, path) for path in paths if path)
+        log.info(f"found {len(hrefs)} links on {url}")
+    except Exception as e:
+        # Best effort: log the failure and carry on with the links so far.
+        log.info("failed to parse page")
+        log.info(e)
 
     # Add to queue
     numAdded = 0