From a9a07aabea026825c3a3d1792df762e2a3d312da Mon Sep 17 00:00:00 2001
From: bitscuit
Date: Sat, 8 May 2021 18:00:49 +0200
Subject: [PATCH] Don't load already visited sites

---
 main.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/main.py b/main.py
index 093fd51..013f235 100644
--- a/main.py
+++ b/main.py
@@ -12,6 +12,9 @@ session = requests.Session()
 urls = queue.Queue()
 urls.put(INDEX_URL)
 
+# Create list to store visited sites
+visited = set([INDEX_URL])
+
 # Loop
 while not urls.empty():
     url = urls.get()
@@ -31,7 +34,12 @@ while not urls.empty():
     print(hrefs)
 
     # Add to queue
-    print("adding %d new urls"%len(hrefs))
+    print("found %d new urls"%len(hrefs), end="")
+    numAdded = 0
     for href in hrefs:
-        urls.put(href)
+        if href not in visited:
+            urls.put(href)
+            visited.add(href)
+            numAdded += 1
+    print(", of which %d new"%numAdded)
 