2021-05-08 15:43:31 +00:00
|
|
|
import requests
|
|
|
|
import queue
|
|
|
|
import re
|
2021-05-08 17:58:05 +00:00
|
|
|
import logging as log
|
2021-05-08 19:28:13 +00:00
|
|
|
import time
|
|
|
|
import random
|
2021-05-09 05:51:34 +00:00
|
|
|
import json
|
2021-05-09 06:04:39 +00:00
|
|
|
import datetime
|
2021-05-08 17:58:05 +00:00
|
|
|
|
|
|
|
# Init logging
# Root logger prints: timestamp, left-padded level name, then the message.
log.basicConfig(
    format="%(asctime)-15s %(levelname)-8s %(message)s",
    level=log.INFO,
)
log.info("Shammer is getting ready...")
|
2021-05-08 15:43:31 +00:00
|
|
|
|
2021-05-09 05:51:34 +00:00
|
|
|
# Init config
# Load the crawler settings from 'config.json' into `config`.
# - If the file does not exist, a default configuration is written and the
#   script stops so the user can edit it before the next run.
# - On any other failure (unreadable file, invalid JSON, ...) the error is
#   logged and the script stops WITHOUT touching the existing file.
try:
    log.debug("Loading 'config.json'...")
    # `with` guarantees the handle is closed even when json.load() raises
    with open("config.json", "r", encoding="utf-8") as f:
        config = json.load(f)
except FileNotFoundError as e:
    # Doesn't exist yet. Only this case may create the default file: a
    # broader OSError (e.g. permission denied) must never overwrite an
    # existing configuration with defaults.
    log.warning("Couldn't open 'config.json'")
    log.warning("%s", e)
    log.info("Creating default configuration file...")
    config = {
        "INDEX_URL": "https://bitscuit.be/",
        "MAX_CONTENT_LENGTH": 500000,
        "REQUESTS_PER_MINUTE": 10,
        "HOUR_START": 7,
        "HOUR_STOP": 23,
    }
    with open("config.json", "w", encoding="utf-8") as f:
        json.dump(config, f, indent="\t")
    log.info("Done. Please edit 'config.json' before running Shammer again!")
    # SystemExit is what exit() raises, without depending on the `site` helper
    raise SystemExit
except Exception as e:
    # Other error
    log.warning("Couldn't open 'config.json'")
    log.warning("%s", e)
    raise SystemExit
|
|
|
|
|
2021-05-08 15:43:31 +00:00
|
|
|
# Init static vars
# Copy the settings out of the loaded configuration into module constants.
# A missing key raises KeyError here, before any crawling starts.
(
    INDEX_URL,
    MAX_CONTENT_LENGTH,
    REQUESTS_PER_MINUTE,
    HOUR_START,
    HOUR_STOP,
) = (
    config[setting]
    for setting in (
        "INDEX_URL",
        "MAX_CONTENT_LENGTH",
        "REQUESTS_PER_MINUTE",
        "HOUR_START",
        "HOUR_STOP",
    )
)
|
2021-05-08 15:43:31 +00:00
|
|
|
|
|
|
|
# Create session
# One HTTP session object is used for every request made below.
session = requests.Session()

# Create list to store visited sites
# Starts out containing the index page, which is also the first queue entry.
visited = {INDEX_URL}

# Create website queue
# FIFO of URLs still to be fetched, seeded with the index page.
urls = queue.Queue()
urls.put(INDEX_URL)
|
|
|
|
|
2021-05-08 15:43:31 +00:00
|
|
|
# Patterns used to pull links out of a page; compiled once, outside the loop.
# Absolute links: scheme, optional sub-domains, a domain of 3+ characters,
# a TLD of 2+ characters and the trailing slash.
_ABSOLUTE_URL_RE = re.compile(r"https?://(?:[a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}/")
# Site-local links: href="..." values made of [a-z0-9/] followed by optional
# ".ext" parts. Written without nested quantifiers so that a hostile page
# cannot trigger exponential backtracking.
_LOCAL_HREF_RE = re.compile(r"href *= *[\"']([a-z0-9/]*(?:\.[a-z0-9]*)*)[\"']")


def _fetch_page(url):
    """Download `url` and return its body as text, or None if it is skipped.

    A page is skipped (and the reason logged) when the request fails, the
    status is not 200, or the announced Content-Length exceeds
    MAX_CONTENT_LENGTH. The response is always closed before returning.
    """
    try:
        # stream=True fetches only the headers up front, so the checks below
        # run before the body is downloaded; reading `.text` then pulls the
        # body over the same connection (no second request needed).
        with session.get(url, stream=True, timeout=5) as r:
            # Read response
            if r.status_code != 200:
                log.info("returned status %d", r.status_code)
                return None

            # Check file size (requests' header mapping is case-insensitive)
            if int(r.headers.get("Content-Length", 0)) > MAX_CONTENT_LENGTH:
                log.info("too large response")
                return None

            # Download full
            return r.text
    except Exception as e:
        # Best-effort crawler: a broken page must not stop the crawl
        log.info("failed")
        log.info(e)
        return None


def _extract_links(url, html):
    """Return every link found in `html`, resolved against the page `url`.

    Absolute URLs are listed first, followed by the site-local href values
    turned into absolute URLs. Empty href values are ignored.
    """
    from urllib.parse import urljoin

    links = _ABSOLUTE_URL_RE.findall(html)
    links.extend(urljoin(url, href) for href in _LOCAL_HREF_RE.findall(html) if href)
    return links


# Loop
while not urls.empty():

    # Check time: only crawl while HOUR_START <= current hour < HOUR_STOP
    if not HOUR_START <= datetime.datetime.now().hour < HOUR_STOP:
        log.debug("Not right time yet, sleeping")
        time.sleep(60)
        continue

    # Get next url from queue
    url = urls.get()

    # Perform request
    log.info("Fetching '%s'...", url)
    html = _fetch_page(url)

    if html is not None:
        # Filter page for hrefs
        try:
            hrefs = _extract_links(url, html)
        except Exception as e:
            log.info("failed to parse page")
            log.info(e)
            hrefs = []

        # Add to queue
        num_added = 0
        for href in hrefs:
            if href not in visited:
                urls.put(href)
                visited.add(href)
                num_added += 1
        log.info("%d urls, %d new, queue length %d", len(hrefs), num_added, urls.qsize())

    # Wait random time (on average 60/REQUESTS_PER_MINUTE seconds). This runs
    # after every attempted request, including skipped and failed ones, so
    # error responses cannot bypass the rate limit.
    if REQUESTS_PER_MINUTE != 0:
        time.sleep(random.uniform(0, 2 * 60 / REQUESTS_PER_MINUTE))
|
2021-05-08 19:28:13 +00:00
|
|
|
|