diff --git a/main.py b/main.py index 4cfbca7..653092d 100644 --- a/main.py +++ b/main.py @@ -40,7 +40,7 @@ except IOError as e: log.warn("Couldn't open 'config.json'") log.warn("%s"%e) log.info("Creating default configuration file...") - config = {"INDEX_URL":"https://bitscuit.be/", "MAX_CONTENT_LENGTH":500000, "REQUESTS_PER_MINUTE":10, "HOUR_START":7, "HOUR_STOP":23, "REGEX_CHUNK_SIZE":5000} + config = {"INDEX_URL":"https://bitscuit.be/", "MAX_CONTENT_LENGTH":500000, "REQUESTS_PER_MINUTE":10, "HOUR_START":7, "HOUR_STOP":23, "REGEX_CHUNK_SIZE":5000, "REGEX_TIMEOUT":10} f = open("config.json", "w") json.dump(config, f, indent="\t") f.close() @@ -59,6 +59,7 @@ REQUESTS_PER_MINUTE = config["REQUESTS_PER_MINUTE"] HOUR_START = config["HOUR_START"] HOUR_STOP = config["HOUR_STOP"] REGEX_CHUNK_SIZE = config["REGEX_CHUNK_SIZE"] +REGEX_TIMEOUT = config["REGEX_TIMEOUT"] # Create session session = requests.Session() @@ -114,12 +115,19 @@ while not urls.empty(): r.close() # Filter page for hrefs + timeStart = time.perf_counter() hrefs = [] try: for chunkIndex in range(0, len(r.text), REGEX_CHUNK_SIZE): + # Check if timeout + if time.perf_counter() - timeStart > REGEX_TIMEOUT: + log.info("regex timeout") + break + + # Apply regex log.debug("regex on chunk %d"%chunkIndex) chunk = r.text[chunkIndex*REGEX_CHUNK_SIZE:min((chunkIndex+1)*REGEX_CHUNK_SIZE, len(r.text))] - hrefs.extend([(url[:(url.rfind("/")+1 if res[0][0] != "/" else url.find("/", 10))]+res[0]) for res in re.findall("href ?= ?[\"'](\/?([a-z0-9]*\/?)+(\.[a-z0-9]*)?)[\"']", chunk)]) + hrefs.extend([(url[:(url.rfind("/")+1 if res[0][0] != "/" else url.find("/", 10))]+res[0]) for res in re.findall("href ?= ?[\"']([a-z0-9\/]+(\.[a-z0-9]*)?)[\"']", chunk)]) hrefs.extend([res[0] for res in re.findall("(https?:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", chunk)]) except Exception as e: log.info("failed to parse page") @@ -139,3 +147,4 @@ while not urls.empty(): l = random.uniform(0, 2*60/REQUESTS_PER_MINUTE) time.sleep(l) +