Wrote first version
This commit is contained in:
parent
b3e781bd98
commit
3866b2be0f
|
@ -0,0 +1,37 @@
|
||||||
|
import requests
|
||||||
|
import queue
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Start page of the crawl.
INDEX_URL = "https://bitscuit.be/"

# FIFO queue of URLs that still have to be fetched, seeded with the start page.
urls = queue.Queue()
urls.put(INDEX_URL)

# A single HTTP session that is reused for every request.
session = requests.Session()
|
||||||
|
|
||||||
|
# Crawl loop: take URLs off the queue, fetch each one, and enqueue every
# absolute site-root URL (scheme://host/) found in the response body.

# Seconds to wait for a server before giving up on a URL; without a timeout a
# single unresponsive host would block the crawl forever.
REQUEST_TIMEOUT = 10

# Matches http(s)://<optional subdomains.><name of 3+ chars>.<tld of 2+ chars>/
# Compiled once, as a raw string so the backslash escapes reach the regex
# engine unchanged.
HREF_PATTERN = re.compile(r"https?://(?:[a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}/")

# URLs that were already dequeued. Pages link back to each other, so without
# this set the queue never drains and the same URLs are fetched repeatedly.
visited = set()

while not urls.empty():
    url = urls.get()

    # Skip anything that was already handled
    if url in visited:
        continue
    visited.add(url)

    # Perform request (flush so the progress text shows before we block)
    print("Fetching url '%s'..." % url, end="", flush=True)
    try:
        r = session.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # One unreachable host must not abort the whole crawl
        print("\tfailed (%s)" % err)
        continue
    print("\tdone")

    # Read response
    if r.status_code != 200:
        print("returned %d" % r.status_code)
        continue

    # Filter page for hrefs
    hrefs = HREF_PATTERN.findall(r.text)
    print(hrefs)

    # Add to queue: drop repeats within the page (keeping first-seen order)
    # and anything that was already visited
    new_urls = [href for href in dict.fromkeys(hrefs) if href not in visited]
    print("adding %d new urls" % len(new_urls))
    for href in new_urls:
        urls.put(href)
|
||||||
|
|
Loading…
Reference in New Issue