"""Shammer: a polite toy web crawler.

Reads crawl settings from 'config.json' (creating a default one on first
run), then breadth-first crawls from INDEX_URL, collecting hrefs, while
respecting a crawl-hours window and a requests-per-minute throttle.
"""
import requests
import queue
import re
import logging as log
import time
import random
import json
import datetime

# Init logging
log.basicConfig(level=log.INFO, format="%(asctime)-15s %(levelname)-8s %(message)s")
log.info("Shammer is getting ready...")

# Init config
try:
    log.debug("Loading 'config.json'...")
    with open("config.json", "r") as f:
        config = json.load(f)
except IOError as e:
    # Config doesn't exist yet: write a default one and ask the user to edit it.
    log.warning("Couldn't open 'config.json'")
    log.warning("%s", e)
    log.info("Creating default configuration file...")
    config = {"INDEX_URL": "https://bitscuit.be/",
              "MAX_CONTENT_LENGTH": 500000,
              "REQUESTS_PER_MINUTE": 10,
              "HOUR_START": 7,
              "HOUR_STOP": 23}
    with open("config.json", "w") as f:
        json.dump(config, f, indent="\t")
    log.info("Done. Please edit 'config.json' before running Shammer again!")
    exit()
except Exception as e:
    # Other error (e.g. malformed JSON): bail out.
    log.warning("Couldn't open 'config.json'")
    log.warning("%s", e)
    exit()

# Init static vars
INDEX_URL = config["INDEX_URL"]            # crawl seed URL
MAX_CONTENT_LENGTH = config["MAX_CONTENT_LENGTH"]  # skip responses larger than this (bytes)
REQUESTS_PER_MINUTE = config["REQUESTS_PER_MINUTE"]  # average request rate; 0 disables throttling
HOUR_START = config["HOUR_START"]          # crawl window start hour (local time, inclusive)
HOUR_STOP = config["HOUR_STOP"]            # crawl window stop hour (local time, exclusive)

# Pre-compiled link patterns, hoisted out of the crawl loop.
# Absolute URLs like "https://sub.example.com/"
ABS_URL_RE = re.compile(r"(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)")
# Relative hrefs like href="/some/path" or href='page.html'
REL_HREF_RE = re.compile(r"href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']")

# Create session (reuses TCP connections across requests)
session = requests.Session()

# Create website queue
urls = queue.Queue()
urls.put(INDEX_URL)

# Set of already-seen URLs, so each is fetched at most once
visited = set([INDEX_URL])

# Main crawl loop
while not urls.empty():
    # Only crawl inside the configured hours window
    hour = datetime.datetime.now().hour
    if not (HOUR_START <= hour < HOUR_STOP):
        log.debug("Not right time yet, sleeping")
        time.sleep(60)
        continue

    # Get next url from queue
    url = urls.get()

    # Perform a streamed request first so oversized responses can be
    # rejected by Content-Length without downloading the body.
    log.info("Fetching '%s'...", url)
    r = None  # BUGFIX: guard so the finally-close can't hit an unbound name
    try:
        r = session.get(url, stream=True, timeout=5)

        # Read response
        if r.status_code != 200:
            log.info("returned status %d", r.status_code)
            continue

        # Check file size. requests' headers mapping is case-insensitive,
        # so a single lookup covers "Content-Length"/"content-length".
        length = r.headers.get("Content-Length")
        if length is not None and int(length) > MAX_CONTENT_LENGTH:
            log.info("too large response")
            continue

        # Download full body (non-streamed)
        r = session.get(url, timeout=5)
    except Exception as e:
        log.info("failed")
        log.info(e)
        # BUGFIX: skip this url on failure instead of falling through and
        # parsing a stale (or never-bound) response object.
        continue
    finally:
        if r is not None:
            r.close()

    # Filter page for hrefs
    hrefs = []
    try:
        hrefs = [res[0] for res in ABS_URL_RE.findall(r.text)]
        # Resolve relative hrefs against the current url: paths starting
        # with "/" are joined to the site root, others to the current dir.
        pages = [(url[:(url.rfind("/") + 1 if res[0][0] != "/" else url.find("/", 10))] + res[0])
                 for res in REL_HREF_RE.findall(r.text)]
        hrefs.extend(pages)
    except Exception as e:
        log.info("failed to parse page")
        log.info(e)

    # Enqueue only urls we haven't seen before
    numAdded = 0
    for href in hrefs:
        if href not in visited:
            urls.put(href)
            visited.add(href)
            numAdded += 1
    log.info("%d urls, %d new, queue length %d", len(hrefs), numAdded, urls.qsize())

    # Throttle: sleep a random time averaging 60/REQUESTS_PER_MINUTE seconds
    if REQUESTS_PER_MINUTE != 0:
        time.sleep(random.uniform(0, 2 * 60 / REQUESTS_PER_MINUTE))