diff --git a/main.py b/main.py
index 093fd51..013f235 100644
--- a/main.py
+++ b/main.py
@@ -12,6 +12,10 @@ session = requests.Session()
 urls = queue.Queue()
 urls.put(INDEX_URL)
 
+# Set of every URL that has ever been queued, so no URL is queued twice.
+# Seeded with the index page because it is already in the queue.
+visited = {INDEX_URL}
+
 # Loop
 while not urls.empty():
     url = urls.get()
@@ -31,7 +35,12 @@ while not urls.empty():
     print(hrefs)
 
     # Add to queue
-    print("adding %d new urls"%len(hrefs))
+    num_added = 0
     for href in hrefs:
-        urls.put(href)
+        # Mark on enqueue (not on fetch) so a link repeated on this page,
+        # or found again before it is fetched, is only queued once.
+        if href not in visited:
+            urls.put(href)
+            visited.add(href)
+            num_added += 1
+    print("found %d urls, of which %d new" % (len(hrefs), num_added))
 