#
# Copyright (C) 2021 Thomas Van Acker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
#
# Shammer is hosted on Gitscuit: <https://git.bitscuit.be/bitscuit/Shammer>
#

import requests
import queue
import re
import logging as log
import time
import random
import json
import datetime

# Init logging
log.basicConfig(level=log.INFO, format="%(asctime)-15s %(levelname)-8s %(message)s")
log.info("Shammer is getting ready...")

# Init config
try:
	log.debug("Loading 'config.json'...")
	f = open("config.json", "r")
	config = json.load(f)
	f.close()
except IOError as e:
	# Doesn't exist yet
	log.warning("Couldn't open 'config.json'")
	log.warning("%s"%e)
	log.info("Creating default configuration file...")
	config = {"INDEX_URL":"https://bitscuit.be/", "MAX_CONTENT_LENGTH":500000, "REQUESTS_PER_MINUTE":10, "HOUR_START":7, "HOUR_STOP":23, "REGEX_CHUNK_SIZE":5000, "REGEX_TIMEOUT":10}
	f = open("config.json", "w")
	json.dump(config, f, indent="\t")
	f.close()
	log.info("Done. Please edit 'config.json' before running Shammer again!")
	exit()
except Exception as e:
	# Other error
	log.warning("Couldn't open 'config.json'")
	log.warning("%s"%e)
	exit()

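# The generated 'config.json' should look roughly like this (these are the
# defaults written above; at minimum, point INDEX_URL at the site to crawl):
#
# {
# 	"INDEX_URL": "https://bitscuit.be/",
# 	"MAX_CONTENT_LENGTH": 500000,
# 	"REQUESTS_PER_MINUTE": 10,
# 	"HOUR_START": 7,
# 	"HOUR_STOP": 23,
# 	"REGEX_CHUNK_SIZE": 5000,
# 	"REGEX_TIMEOUT": 10
# }
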
# Init static vars
INDEX_URL = config["INDEX_URL"]
MAX_CONTENT_LENGTH = config["MAX_CONTENT_LENGTH"]
REQUESTS_PER_MINUTE = config["REQUESTS_PER_MINUTE"]
HOUR_START = config["HOUR_START"]
HOUR_STOP = config["HOUR_STOP"]
REGEX_CHUNK_SIZE = config["REGEX_CHUNK_SIZE"]
REGEX_TIMEOUT = config["REGEX_TIMEOUT"]

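# How these settings are used below:
# - INDEX_URL: seed url the crawl starts from
# - MAX_CONTENT_LENGTH: skip responses whose Content-Length header is larger
# - REQUESTS_PER_MINUTE: average fetch rate; 0 disables the sleep between fetches
# - HOUR_START / HOUR_STOP: only crawl between these hours of the day
# - REGEX_CHUNK_SIZE: scan page text for links in chunks of this many characters
# - REGEX_TIMEOUT: give up scanning a page after this many seconds
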
# Create session
session = requests.Session()

# Create website queue
urls = queue.Queue()
urls.put(INDEX_URL)

# Create list to store visited sites
visited = set([INDEX_URL])
numFetched = 0

# Loop
while not urls.empty():

	# Check time
	hour = datetime.datetime.now().hour
	if not (hour >= HOUR_START and hour < HOUR_STOP):
		log.debug("Not right time yet, sleeping")
		time.sleep(60)
		continue

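	# With the default config (HOUR_START=7, HOUR_STOP=23) fetching only happens
	# between 07:00 and 22:59 local time; outside that window the loop simply
	# re-checks every 60 seconds.
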
	# Get next url from queue
	url = urls.get()

	# Perform request
	log.info("Fetch %d: '%s'..."%(numFetched, url))
	numFetched += 1
	r = None
	try:
		r = session.get(url, stream=True, timeout=5)
		r.close()

		# Read response
		if r.status_code != 200:
			log.info("returned status %d"%r.status_code)
			continue

		# Check file size
		if ("Content-Length" in r.headers.keys() and int(r.headers["Content-Length"]) > MAX_CONTENT_LENGTH) or ("content-length" in r.headers.keys() and int(r.headers["content-length"]) > MAX_CONTENT_LENGTH):
			log.info("too large response")
			continue

		# Check file type
		if ("Content-Type" in r.headers.keys() and not r.headers["Content-Type"].startswith("text/")) or ("content-type" in r.headers.keys() and not r.headers["content-type"].startswith("text/")):
			log.info("response not text")
			continue

		# Download full
		r = session.get(url, timeout=5)

	except Exception as e:
		log.info("failed")
		log.info(e)
		continue
	finally:
		if r is not None:
			r.close()

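	# Note: the first request above uses stream=True, so requests only downloads
	# the headers before r.close() is called; the body is only fetched by the
	# second, non-streaming request once the size and type checks have passed.
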
	# Filter page for hrefs
	timeStart = time.perf_counter()
	hrefs = []
	try:
		for chunkStart in range(0, len(r.text), REGEX_CHUNK_SIZE):
			# Check if timeout
			if time.perf_counter() - timeStart > REGEX_TIMEOUT:
				log.info("regex timeout")
				break

			# Apply regex
			log.debug("regex on chunk starting at %d"%chunkStart)
			chunk = r.text[chunkStart:chunkStart+REGEX_CHUNK_SIZE]
			# Relative hrefs, resolved against the current url
			hrefs.extend([(url[:(url.rfind("/")+1 if res[0][0] != "/" else url.find("/", 10))]+res[0]) for res in re.findall(r"href ?= ?[\"']([a-z0-9/]+(\.[a-z0-9]*)?)[\"']", chunk)])
			# Absolute hrefs (scheme + host), taken as-is
			hrefs.extend([res[0] for res in re.findall(r"(https?://([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}/)", chunk)])
	except Exception as e:
		log.info("failed to parse page")
		log.info(e)

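	# The slicing in the first extend() above is a small heuristic for resolving
	# links: hrefs starting with "/" are appended to the scheme+host part of the
	# current url (everything before the first "/" after index 10, i.e. past
	# "https://"), other relative hrefs to the current url up to its last "/".
	# If this ever needs to handle more cases ("..", query strings), the standard
	# library's urllib.parse.urljoin(url, href) does the same job more robustly.
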
	# Add to queue
	numAdded = 0
	for href in hrefs:
		if href not in visited:
			urls.put(href)
			visited.add(href)
			numAdded += 1
	log.info("%d urls, %d new, queue length %d"%(len(hrefs), numAdded, urls.qsize()))

	# Wait random time
	if REQUESTS_PER_MINUTE != 0:
		l = random.uniform(0, 2*60/REQUESTS_PER_MINUTE)
		time.sleep(l)
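	# random.uniform(0, 2*60/REQUESTS_PER_MINUTE) sleeps 60/REQUESTS_PER_MINUTE
	# seconds on average, so the crawl averages roughly REQUESTS_PER_MINUTE
	# requests per minute (the default of 10 gives sleeps between 0 and 12
	# seconds, 6 seconds on average).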