Added regex timeout and optimized regex
parent d88f95abfa
commit 2d399a5325

main.py (13 changes)
@@ -40,7 +40,7 @@ except IOError as e:
 	log.warn("Couldn't open 'config.json'")
 	log.warn("%s"%e)
 	log.info("Creating default configuration file...")
-	config = {"INDEX_URL":"https://bitscuit.be/", "MAX_CONTENT_LENGTH":500000, "REQUESTS_PER_MINUTE":10, "HOUR_START":7, "HOUR_STOP":23, "REGEX_CHUNK_SIZE":5000}
+	config = {"INDEX_URL":"https://bitscuit.be/", "MAX_CONTENT_LENGTH":500000, "REQUESTS_PER_MINUTE":10, "HOUR_START":7, "HOUR_STOP":23, "REGEX_CHUNK_SIZE":5000, "REGEX_TIMEOUT":10}
 	f = open("config.json", "w")
 	json.dump(config, f, indent="\t")
 	f.close()
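The hunk above is the fallback path that only runs when config.json cannot be opened; the new REGEX_TIMEOUT key (seconds) is written together with the existing defaults. A minimal standalone sketch of that path, assuming the same key names and default values as the diff; the DEFAULT_CONFIG name and the with-blocks are illustrative and not taken from main.py:

import json
import logging

log = logging.getLogger(__name__)

# Defaults mirroring the dict in the diff, including the new REGEX_TIMEOUT (seconds)
DEFAULT_CONFIG = {
	"INDEX_URL": "https://bitscuit.be/",
	"MAX_CONTENT_LENGTH": 500000,
	"REQUESTS_PER_MINUTE": 10,
	"HOUR_START": 7,
	"HOUR_STOP": 23,
	"REGEX_CHUNK_SIZE": 5000,
	"REGEX_TIMEOUT": 10,
}

try:
	with open("config.json") as f:
		config = json.load(f)
except IOError as e:
	log.warning("Couldn't open 'config.json': %s", e)
	log.info("Creating default configuration file...")
	config = dict(DEFAULT_CONFIG)
	with open("config.json", "w") as f:
		json.dump(config, f, indent="\t")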
@@ -59,6 +59,7 @@ REQUESTS_PER_MINUTE = config["REQUESTS_PER_MINUTE"]
 HOUR_START = config["HOUR_START"]
 HOUR_STOP = config["HOUR_STOP"]
 REGEX_CHUNK_SIZE = config["REGEX_CHUNK_SIZE"]
+REGEX_TIMEOUT = config["REGEX_TIMEOUT"]
 
 # Create session
 session = requests.Session()
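REGEX_TIMEOUT is a new key, so a config.json written before this commit will not contain it and the direct config["REGEX_TIMEOUT"] lookup above would raise KeyError for such a file. The diff keeps the direct lookup; the line below is only a sketch of a backwards-compatible read, with the fallback of 10 taken from the default config:

# Not part of the commit: fall back to the default (10 s) if an older
# config.json lacks the new key.
REGEX_TIMEOUT = config.get("REGEX_TIMEOUT", 10)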
@@ -114,12 +115,19 @@ while not urls.empty():
 	r.close()
 
 	# Filter page for hrefs
+	timeStart = time.perf_counter()
 	hrefs = []
 	try:
 		for chunkIndex in range(0, len(r.text), REGEX_CHUNK_SIZE):
+			# Check if timeout
+			if time.perf_counter() - timeStart > REGEX_TIMEOUT:
+				log.info("regex timeout")
+				break
+
+			# Apply regex
 			log.debug("regex on chunk %d"%chunkIndex)
 			chunk = r.text[chunkIndex*REGEX_CHUNK_SIZE:min((chunkIndex+1)*REGEX_CHUNK_SIZE, len(r.text))]
-			hrefs.extend([(url[:(url.rfind("/")+1 if res[0][0] != "/" else url.find("/", 10))]+res[0]) for res in re.findall("href ?= ?[\"'](\/?([a-z0-9]*\/?)+(\.[a-z0-9]*)?)[\"']", chunk)])
+			hrefs.extend([(url[:(url.rfind("/")+1 if res[0][0] != "/" else url.find("/", 10))]+res[0]) for res in re.findall("href ?= ?[\"']([a-z0-9\/]+(\.[a-z0-9]*)?)[\"']", chunk)])
 			hrefs.extend([res[0] for res in re.findall("(https?:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", chunk)])
 	except Exception as e:
 		log.info("failed to parse page")
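Two changes land in this hunk: the page text is scanned in REGEX_CHUNK_SIZE-sized chunks with a wall-clock budget checked between chunks (Python's re module has no built-in timeout, so a findall call that is already running cannot be interrupted), and the href pattern replaces the nested quantifier (\/?([a-z0-9]*\/?)+ with the flat [a-z0-9\/]+, which removes the catastrophic-backtracking risk of the old pattern. A self-contained sketch of the same chunk-and-timeout approach; the function name and signature are illustrative, not part of main.py:

import re
import time

def find_hrefs(text, base_url, chunk_size=5000, timeout=10):
	"""Collect links from text in fixed-size chunks, stopping when the time budget is spent."""
	href_re = re.compile(r"href ?= ?[\"']([a-z0-9/]+(\.[a-z0-9]*)?)[\"']")
	url_re = re.compile(r"(https?://([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}/)")

	hrefs = []
	start = time.perf_counter()
	for offset in range(0, len(text), chunk_size):
		# The budget is only checked between chunks; keeping chunks small
		# bounds how long any single findall call can run.
		if time.perf_counter() - start > timeout:
			break
		chunk = text[offset:offset + chunk_size]
		for res in href_re.findall(chunk):
			path = res[0]
			if path.startswith("/"):
				# Site-absolute path: keep only scheme and host of base_url
				hrefs.append(base_url[:base_url.find("/", 10)] + path)
			else:
				# Relative path: resolve against base_url's directory
				hrefs.append(base_url[:base_url.rfind("/") + 1] + path)
		# Fully qualified URLs anywhere in the chunk
		hrefs.extend(res[0] for res in url_re.findall(chunk))
	return hrefs

The chunk is sliced with the loop offset directly, since range() already advances in chunk_size steps.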
@@ -139,3 +147,4 @@ while not urls.empty():
 	l = random.uniform(0, 2*60/REQUESTS_PER_MINUTE)
 	time.sleep(l)
 
+
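The context in this final hunk is the crawler's pacing: each loop iteration sleeps a uniform random interval in [0, 2*60/REQUESTS_PER_MINUTE] seconds, which averages one request every 60/REQUESTS_PER_MINUTE seconds without a fixed cadence. A minimal sketch, using the default rate from the config above:

import random
import time

REQUESTS_PER_MINUTE = 10  # default from config.json above

# uniform(0, 2*m) has mean m, so this averages 60/REQUESTS_PER_MINUTE seconds
# (6 s here) between requests while jittering the exact timing.
l = random.uniform(0, 2*60/REQUESTS_PER_MINUTE)
time.sleep(l)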