Don't load already visited sites

This commit is contained in:
bitscuit 2021-05-08 18:00:49 +02:00
parent 3866b2be0f
commit a9a07aabea
1 changed files with 10 additions and 2 deletions

12
main.py
View File

@ -12,6 +12,9 @@ session = requests.Session()
urls = queue.Queue() urls = queue.Queue()
urls.put(INDEX_URL) urls.put(INDEX_URL)
# Create set to store visited sites
visited = set([INDEX_URL])
# Loop # Loop
while not urls.empty(): while not urls.empty():
url = urls.get() url = urls.get()
@ -31,7 +34,12 @@ while not urls.empty():
print(hrefs) print(hrefs)
# Add to queue # Add to queue
print("adding %d new urls"%len(hrefs)) print("found %d new urls"%len(hrefs), end="")
numAdded = 0
for href in hrefs: for href in hrefs:
urls.put(href) if href not in visited:
urls.put(href)
visited.add(href)
numAdded += 1
print(", of which %d new"%numAdded)