import requests
import queue
import re
import logging as log

# Init logging
log.basicConfig(level=log.INFO, format="%(asctime)-15s %(levelname)-8s %(message)s")
log.info("Shammer is getting ready...")

# Init static vars
INDEX_URL = "https://bitscuit.be/"
MAX_CONTENT_LENGTH = 500000

# Create session
session = requests.Session()

# Create website queue
urls = queue.Queue()
urls.put(INDEX_URL)

# Create set to store visited sites
visited = set([INDEX_URL])

# Loop
while not urls.empty():
    url = urls.get()

    # Perform request
    log.info("Fetching '%s'..." % url)
    r = None
    try:
        r = session.get(url, stream=True)

        # Check file size before downloading the body
        # (requests header lookups are case-insensitive, so one check covers both spellings)
        if "Content-Length" in r.headers and int(r.headers["Content-Length"]) > MAX_CONTENT_LENGTH:
            log.info("too large response")
            continue

        # Download full page (close the streamed response first to release the connection)
        r.close()
        r = session.get(url)
    except Exception as e:
        log.info("failed")
        log.info(e)
        continue
    finally:
        if r is not None:
            r.close()

    # Read response
    if r.status_code != 200:
        log.info("returned status %d" % r.status_code)
        continue

    # Filter page for hrefs: absolute URLs first, then relative paths resolved against the current URL
    hrefs = []
    try:
        hrefs = [res[0] for res in re.findall(r"(https?://([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}/)", r.text)]
        pages = [(url[:url.rfind("/") + 1] + res[0][(1 if res[0][0] == "/" else 0):]) for res in re.findall(r"href *= *[\"'](/*([a-z0-9]*/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
        hrefs.extend(pages)
        print(hrefs)
    except Exception as e:
        log.info("failed to parse page")
        log.info(e)

    # Add unseen URLs to the queue
    numAdded = 0
    for href in hrefs:
        if href not in visited:
            urls.put(href)
            visited.add(href)
            numAdded += 1
    log.info("%d urls, %d new, queue length %d" % (len(hrefs), numAdded, urls.qsize()))