"""Shammer: a minimal breadth-first web crawler.

Starts from INDEX_URL, fetches each page over a shared requests.Session,
extracts root URLs (scheme://host/) with a regex, and enqueues every URL
not seen before. Runs until the queue empties (in practice: indefinitely).
"""
import logging as log
import queue
import re

# Init logging
log.basicConfig(level=log.INFO,
                format="%(asctime)-15s %(levelname)-8s %(message)s")
log.info("Shammer is getting ready...")

# Init static vars: seed URL for the crawl.
INDEX_URL = "https://bitscuit.be/"

# Matches root URLs such as "https://sub.example.com/".
# BUG FIX: the original pattern used "https*", which also matches
# "httpss://", "httpsss://", ...; "https?" is the intended http-or-https.
# Raw string avoids invalid "\/" escape warnings; compiled once instead of
# being re-parsed for every fetched page.
URL_RE = re.compile(r"(https?://([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}/)")


def extract_links(text):
    """Return the list of root URLs found in *text*.

    findall() yields (full_match, last_subdomain_group) tuples; only the
    full match (element 0) is of interest.
    """
    return [match[0] for match in URL_RE.findall(text)]


def crawl():
    """Breadth-first crawl starting from INDEX_URL until the queue empties."""
    # Imported lazily so the module can be imported (e.g. for extract_links)
    # without the third-party `requests` package installed.
    import requests

    session = requests.Session()

    # Website queue of URLs still to fetch.
    urls = queue.Queue()
    urls.put(INDEX_URL)
    # Everything ever enqueued — prevents fetching the same URL twice.
    visited = {INDEX_URL}

    while not urls.empty():
        url = urls.get()
        log.info("Fetching '%s'...", url)

        try:
            r = session.get(url)
        except Exception as e:
            # BUG FIX: the original fell through after a failed request and
            # used `r` anyway — a NameError on the first iteration, or a
            # stale response from the previous iteration afterwards.
            log.info("failed")
            log.info(e)
            continue

        try:
            # Read response while it is still open (the original closed it
            # first and relied on requests having pre-buffered the body).
            if r.status_code != 200:
                log.info("returned status %d", r.status_code)
                continue
            # Filter page for hrefs.
            hrefs = extract_links(r.text)
        finally:
            r.close()

        # Add unseen URLs to the queue.
        num_added = 0
        for href in hrefs:
            if href not in visited:
                urls.put(href)
                visited.add(href)
                num_added += 1
        log.info("%d urls, %d new, queue length %d",
                 len(hrefs), num_added, urls.qsize())


if __name__ == "__main__":
    crawl()