#
# Copyright (C) 2021 Thomas Van Acker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
#
# Shammer is hosted on Gitscuit:

import requests
import queue
import re
import logging as log
import time
import random
import json
import datetime

# Init logging
log.basicConfig(level=log.INFO, format="%(asctime)-15s %(levelname)-8s %(message)s")
log.info("Shammer is getting ready...")

# Init config
try:
    log.debug("Loading 'config.json'...")
    f = open("config.json", "r")
    config = json.load(f)
    f.close()
except IOError as e:
    # Doesn't exist yet
    log.warning("Couldn't open 'config.json'")
    log.warning("%s" % e)
    log.info("Creating default configuration file...")
    config = {"INDEX_URL": "https://bitscuit.be/", "MAX_CONTENT_LENGTH": 500000,
              "REQUESTS_PER_MINUTE": 10, "HOUR_START": 7, "HOUR_STOP": 23,
              "REGEX_CHUNK_SIZE": 5000, "REGEX_TIMEOUT": 10}
    f = open("config.json", "w")
    json.dump(config, f, indent="\t")
    f.close()
    log.info("Done. Please edit 'config.json' before running Shammer again!")
    exit()
except Exception as e:
    # Other error
    log.warning("Couldn't open 'config.json'")
    log.warning("%s" % e)
    exit()

# Init static vars
INDEX_URL = config["INDEX_URL"]
MAX_CONTENT_LENGTH = config["MAX_CONTENT_LENGTH"]
REQUESTS_PER_MINUTE = config["REQUESTS_PER_MINUTE"]
HOUR_START = config["HOUR_START"]
HOUR_STOP = config["HOUR_STOP"]
REGEX_CHUNK_SIZE = config["REGEX_CHUNK_SIZE"]
REGEX_TIMEOUT = config["REGEX_TIMEOUT"]

# Create session
session = requests.Session()

# Create website queue
urls = queue.Queue()
urls.put(INDEX_URL)

# Create set to store visited sites
visited = set([INDEX_URL])
numFetched = 0

# Loop
while not urls.empty():
    # Only crawl between HOUR_START and HOUR_STOP
    hour = datetime.datetime.now().hour
    if not (HOUR_START <= hour < HOUR_STOP):
        log.debug("Not right time yet, sleeping")
        time.sleep(60)
        continue

    # Wait a random time so that on average REQUESTS_PER_MINUTE requests are sent
    if REQUESTS_PER_MINUTE != 0:
        delay = random.uniform(0, 2 * 60 / REQUESTS_PER_MINUTE)
        time.sleep(delay)

    # Get next url from queue
    url = urls.get()

    # Perform request
    log.info("Fetch %d: '%s'..." % (numFetched, url))
    numFetched += 1
    try:
        # Fetch headers only (stream=True, body is not read) to inspect the response first
        r = session.get(url, stream=True, timeout=5)
        r.close()

        # Read response
        if r.status_code != 200:
            log.info("returned status %d" % r.status_code)
            continue

        # Check file size (requests' headers dict is case-insensitive)
        if "Content-Length" in r.headers and int(r.headers["Content-Length"]) > MAX_CONTENT_LENGTH:
            log.info("too large response")
            continue

        # Check file type
        if "Content-Type" in r.headers and not r.headers["Content-Type"].startswith("text/"):
            log.info("response not text")
            continue

        # Download full page
        r = session.get(url, timeout=5)
        r.close()
    except Exception as e:
        log.info("failed")
        log.info(e)
        continue

    # Filter page for hrefs, in chunks so a slow regex can be aborted after REGEX_TIMEOUT seconds
    timeStart = time.perf_counter()
    hrefs = []
    try:
        for chunkStart in range(0, len(r.text), REGEX_CHUNK_SIZE):
            # Check if timeout
            if time.perf_counter() - timeStart > REGEX_TIMEOUT:
                log.info("regex timeout")
                break

            # Apply regex to this chunk
            log.debug("regex on chunk at offset %d" % chunkStart)
            chunk = r.text[chunkStart:chunkStart + REGEX_CHUNK_SIZE]
            # Relative hrefs: resolve against the current directory, or against the host for paths starting with "/"
            hrefs.extend([(url[:(url.rfind("/") + 1 if res[0][0] != "/" else url.find("/", 10))] + res[0])
                          for res in re.findall(r"href ?= ?[\"']([a-z0-9/]+(\.[a-z0-9]*)?)[\"']", chunk)])
            # Absolute URLs
            hrefs.extend([res[0] for res in re.findall(r"(https?://([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}/)", chunk)])
    except Exception as e:
        log.info("failed to parse page")
        log.info(e)

    # Add new urls to the queue
    numAdded = 0
    for href in hrefs:
        if href not in visited:
            urls.put(href)
            visited.add(href)
            numAdded += 1
    log.info("%d urls, %d new, queue length %d" % (len(hrefs), numAdded, urls.qsize()))