Don't load already visited sites
This commit is contained in:
parent
3866b2be0f
commit
a9a07aabea
12
main.py
12
main.py
|
@ -12,6 +12,9 @@ session = requests.Session()
|
||||||
urls = queue.Queue()
|
urls = queue.Queue()
|
||||||
urls.put(INDEX_URL)
|
urls.put(INDEX_URL)
|
||||||
|
|
||||||
|
# Create set to store visited sites
|
||||||
|
visited = set([INDEX_URL])
|
||||||
|
|
||||||
# Loop
|
# Loop
|
||||||
while not urls.empty():
|
while not urls.empty():
|
||||||
url = urls.get()
|
url = urls.get()
|
||||||
|
@ -31,7 +34,12 @@ while not urls.empty():
|
||||||
print(hrefs)
|
print(hrefs)
|
||||||
|
|
||||||
# Add to queue
|
# Add to queue
|
||||||
print("adding %d new urls"%len(hrefs))
|
print("found %d new urls"%len(hrefs), end="")
|
||||||
|
numAdded = 0
|
||||||
for href in hrefs:
|
for href in hrefs:
|
||||||
urls.put(href)
|
if href not in visited:
|
||||||
|
urls.put(href)
|
||||||
|
visited.add(href)
|
||||||
|
numAdded += 1
|
||||||
|
print(", of which %d new"%numAdded)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue