From 9fa95aba037bf7fd64d86440b4545ee1a03a960c Mon Sep 17 00:00:00 2001 From: bitscuit Date: Sat, 8 May 2021 20:31:44 +0200 Subject: [PATCH] Also parse non-index urls --- main.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 6357c9b..0055ecc 100644 --- a/main.py +++ b/main.py @@ -40,7 +40,15 @@ while not urls.empty(): continue # Filter page for hrefs - hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)] + hrefs = [] + try: + hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)] + pages = [(url[:url.rfind("/")+1]+res[0][(1 if res[0][0] == "/" else 0):]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)] + hrefs.extend(pages) + print(hrefs) + except Exception as e: + log.info("failed to parse page") + log.info(e) # Add to queue numAdded = 0