Shammer/main.py

38 lines
749 B
Python

import requests
import queue
import re
# Minimal breadth-first crawler: starting from INDEX_URL, fetch each queued
# page, extract every site-root URL (scheme://host/) found in the response
# body, and queue the ones that have not been queued before.

# Init static vars
INDEX_URL = "https://bitscuit.be/"
# Seconds to wait on a server before giving up. requests applies no timeout
# unless one is passed, so a single unresponsive host would stall the crawl.
REQUEST_TIMEOUT = 10
# "http://" or "https://", then dot-separated labels of [a-z0-9] (the last two
# being at least 3 and 2 characters long), then a trailing slash.
# Raw string avoids invalid-escape warnings; compiled once, outside the loop.
HREF_PATTERN = re.compile(r"https?://(?:[a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}/")

# Create session
session = requests.Session()

# Create website queue
urls = queue.Queue()
urls.put(INDEX_URL)
# Every URL that has ever been queued. Without it, a URL that appears on more
# than one page (or on its own page) is queued again each time it is found and
# the crawl never terminates once links form a cycle.
seen = {INDEX_URL}

# Loop
while not urls.empty():
    url = urls.get()

    # Perform request
    # flush so the progress text is visible while the request is in flight
    print("Fetching url '%s'..."%url, end="", flush=True)
    try:
        r = session.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # One unreachable or misbehaving host must not abort the whole crawl
        print("\tfailed (%s)"%err)
        continue
    print("\tdone")

    # Read response
    if r.status_code != 200:
        print("returned %d"%r.status_code)
        continue

    # Filter page for hrefs
    # The pattern has no capture groups, so findall yields the full matches
    hrefs = HREF_PATTERN.findall(r.text)
    print(hrefs)

    # Add to queue (only URLs that were never queued before)
    new_hrefs = []
    for href in hrefs:
        if href not in seen:
            seen.add(href)
            new_hrefs.append(href)
    print("adding %d new urls"%len(new_hrefs))
    for href in new_hrefs:
        urls.put(href)