Added regex timeout and optimized regex

This commit is contained in:
bitscuit 2021-05-09 13:16:01 +02:00
parent d88f95abfa
commit 2d399a5325
1 changed files with 11 additions and 2 deletions

13
main.py
View File

@@ -40,7 +40,7 @@ except IOError as e:
log.warn("Couldn't open 'config.json'") log.warn("Couldn't open 'config.json'")
log.warn("%s"%e) log.warn("%s"%e)
log.info("Creating default configuration file...") log.info("Creating default configuration file...")
config = {"INDEX_URL":"https://bitscuit.be/", "MAX_CONTENT_LENGTH":500000, "REQUESTS_PER_MINUTE":10, "HOUR_START":7, "HOUR_STOP":23, "REGEX_CHUNK_SIZE":5000} config = {"INDEX_URL":"https://bitscuit.be/", "MAX_CONTENT_LENGTH":500000, "REQUESTS_PER_MINUTE":10, "HOUR_START":7, "HOUR_STOP":23, "REGEX_CHUNK_SIZE":5000, "REGEX_TIMEOUT":10}
f = open("config.json", "w") f = open("config.json", "w")
json.dump(config, f, indent="\t") json.dump(config, f, indent="\t")
f.close() f.close()
@@ -59,6 +59,7 @@ REQUESTS_PER_MINUTE = config["REQUESTS_PER_MINUTE"]
HOUR_START = config["HOUR_START"] HOUR_START = config["HOUR_START"]
HOUR_STOP = config["HOUR_STOP"] HOUR_STOP = config["HOUR_STOP"]
REGEX_CHUNK_SIZE = config["REGEX_CHUNK_SIZE"] REGEX_CHUNK_SIZE = config["REGEX_CHUNK_SIZE"]
REGEX_TIMEOUT = config["REGEX_TIMEOUT"]
# Create session # Create session
session = requests.Session() session = requests.Session()
@@ -114,12 +115,19 @@ while not urls.empty():
r.close() r.close()
# Filter page for hrefs # Filter page for hrefs
timeStart = time.perf_counter()
hrefs = [] hrefs = []
try: try:
for chunkIndex in range(0, len(r.text), REGEX_CHUNK_SIZE): for chunkIndex in range(0, len(r.text), REGEX_CHUNK_SIZE):
# Check if timeout
if time.perf_counter() - timeStart > REGEX_TIMEOUT:
log.info("regex timeout")
break
# Apply regex
log.debug("regex on chunk %d"%chunkIndex) log.debug("regex on chunk %d"%chunkIndex)
chunk = r.text[chunkIndex*REGEX_CHUNK_SIZE:min((chunkIndex+1)*REGEX_CHUNK_SIZE, len(r.text))] chunk = r.text[chunkIndex*REGEX_CHUNK_SIZE:min((chunkIndex+1)*REGEX_CHUNK_SIZE, len(r.text))]
hrefs.extend([(url[:(url.rfind("/")+1 if res[0][0] != "/" else url.find("/", 10))]+res[0]) for res in re.findall("href ?= ?[\"'](\/?([a-z0-9]*\/?)+(\.[a-z0-9]*)?)[\"']", chunk)]) hrefs.extend([(url[:(url.rfind("/")+1 if res[0][0] != "/" else url.find("/", 10))]+res[0]) for res in re.findall("href ?= ?[\"']([a-z0-9\/]+(\.[a-z0-9]*)?)[\"']", chunk)])
hrefs.extend([res[0] for res in re.findall("(https?:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", chunk)]) hrefs.extend([res[0] for res in re.findall("(https?:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", chunk)])
except Exception as e: except Exception as e:
log.info("failed to parse page") log.info("failed to parse page")
@@ -139,3 +147,4 @@ while not urls.empty():
l = random.uniform(0, 2*60/REQUESTS_PER_MINUTE) l = random.uniform(0, 2*60/REQUESTS_PER_MINUTE)
time.sleep(l) time.sleep(l)