diff --git a/main.py b/main.py
index 7816e77..4cfbca7 100644
--- a/main.py
+++ b/main.py
@@ -40,7 +40,7 @@ except IOError as e:
 	log.warn("Couldn't open 'config.json'")
 	log.warn("%s"%e)
 	log.info("Creating default configuration file...")
-	config = {"INDEX_URL":"https://bitscuit.be/", "MAX_CONTENT_LENGTH":500000, "REQUESTS_PER_MINUTE":10, "HOUR_START":7, "HOUR_STOP":23}
+	config = {"INDEX_URL":"https://bitscuit.be/", "MAX_CONTENT_LENGTH":500000, "REQUESTS_PER_MINUTE":10, "HOUR_START":7, "HOUR_STOP":23, "REGEX_CHUNK_SIZE":5000}
 	f = open("config.json", "w")
 	json.dump(config, f, indent="\t")
 	f.close()
@@ -58,6 +58,7 @@ MAX_CONTENT_LENGTH = config["MAX_CONTENT_LENGTH"]
 REQUESTS_PER_MINUTE = config["REQUESTS_PER_MINUTE"]
 HOUR_START = config["HOUR_START"]
 HOUR_STOP = config["HOUR_STOP"]
+REGEX_CHUNK_SIZE = config.get("REGEX_CHUNK_SIZE", 5000) # .get(): config.json files written before this key existed must not KeyError
 
 # Create session
 session = requests.Session()
@@ -98,6 +99,11 @@ while not urls.empty():
 			log.info("too large response")
 			continue
 
+		# Skip non-text responses; r.headers is case-insensitive, so one spelling covers both
+		if "Content-Type" in r.headers and not r.headers["Content-Type"].startswith("text/"):
+			log.info("response not text")
+			continue
+
 		# Download full
 		r = session.get(url, timeout=5)
 
@@ -110,9 +116,11 @@ while not urls.empty():
 		# Filter page for hrefs
 		hrefs = []
 		try:
-			hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
-			pages = [(url[:(url.rfind("/")+1 if res[0][0] != "/" else url.find("/", 10))]+res[0]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
-			hrefs.extend(pages)
+			for chunkIndex in range(0, len(r.text), REGEX_CHUNK_SIZE):
+				log.debug("regex on chunk %d"%chunkIndex)
+				chunk = r.text[chunkIndex:chunkIndex+REGEX_CHUNK_SIZE] # chunkIndex already steps by REGEX_CHUNK_SIZE; NOTE(review): matches spanning a chunk boundary are missed
+				hrefs.extend([(url[:(url.rfind("/")+1 if res[0][0] != "/" else url.find("/", 10))]+res[0]) for res in re.findall("href ?= ?[\"'](\/?([a-z0-9]*\/?)+(\.[a-z0-9]*)?)[\"']", chunk)])
+				hrefs.extend([res[0] for res in re.findall("(https?:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", chunk)])
 		except Exception as e:
 			log.info("failed to parse page")
 			log.info(e)