Optimized regex
commit d88f95abfa
parent f40b2c809b

Changed files: main.py (16 lines changed)
@@ -40,7 +40,7 @@ except IOError as e:
     log.warn("Couldn't open 'config.json'")
     log.warn("%s"%e)
     log.info("Creating default configuration file...")
-    config = {"INDEX_URL":"https://bitscuit.be/", "MAX_CONTENT_LENGTH":500000, "REQUESTS_PER_MINUTE":10, "HOUR_START":7, "HOUR_STOP":23}
+    config = {"INDEX_URL":"https://bitscuit.be/", "MAX_CONTENT_LENGTH":500000, "REQUESTS_PER_MINUTE":10, "HOUR_START":7, "HOUR_STOP":23, "REGEX_CHUNK_SIZE":5000}
     f = open("config.json", "w")
     json.dump(config, f, indent="\t")
     f.close()
@@ -58,6 +58,7 @@ MAX_CONTENT_LENGTH = config["MAX_CONTENT_LENGTH"]
 REQUESTS_PER_MINUTE = config["REQUESTS_PER_MINUTE"]
 HOUR_START = config["HOUR_START"]
 HOUR_STOP = config["HOUR_STOP"]
+REGEX_CHUNK_SIZE = config["REGEX_CHUNK_SIZE"]
 
 # Create session
 session = requests.Session()
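Note that REGEX_CHUNK_SIZE is read with a plain dict lookup, so a config.json written before this change (one without the new key) would raise a KeyError on startup. A minimal, hypothetical fallback, assuming the same 5000-character default used in the generated config; this is a sketch, not part of the commit:

    # Hypothetical tolerant lookup with a default chunk size.
    REGEX_CHUNK_SIZE = config.get("REGEX_CHUNK_SIZE", 5000)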
@@ -98,6 +99,11 @@ while not urls.empty():
             log.info("too large response")
             continue
 
+        # Check file type
+        if ("Content-Type" in r.headers.keys() and not r.headers["Content-Type"].startswith("text/")) or ("content-type" in r.headers.keys() and not r.headers["content-type"].startswith("text/")):
+            log.info("response not text")
+            continue
+
         # Download full
         r = session.get(url, timeout=5)
 
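The new guard checks both the "Content-Type" and "content-type" spellings explicitly. For illustration only: requests exposes response headers through a case-insensitive mapping, so an equivalent check can use a single lookup. The helper name below is hypothetical and not part of this commit:

    # Illustrative helper using requests' case-insensitive headers.
    def is_non_text(response):
        content_type = response.headers.get("Content-Type", "")
        # Skip only when a Content-Type is present and it is not text/*.
        return bool(content_type) and not content_type.startswith("text/")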
@@ -110,9 +116,11 @@ while not urls.empty():
     # Filter page for hrefs
     hrefs = []
     try:
-        hrefs = [res[0] for res in re.findall("(https*:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", r.text)]
-        pages = [(url[:(url.rfind("/")+1 if res[0][0] != "/" else url.find("/", 10))]+res[0]) for res in re.findall("href *= *[\"'](\/*([a-z0-9]*\/*)+(\.[a-z0-9]*)*)[\"']", r.text)]
-        hrefs.extend(pages)
+        for chunkIndex in range(0, len(r.text), REGEX_CHUNK_SIZE):
+            log.debug("regex on chunk %d"%chunkIndex)
+            chunk = r.text[chunkIndex*REGEX_CHUNK_SIZE:min((chunkIndex+1)*REGEX_CHUNK_SIZE, len(r.text))]
+            hrefs.extend([(url[:(url.rfind("/")+1 if res[0][0] != "/" else url.find("/", 10))]+res[0]) for res in re.findall("href ?= ?[\"'](\/?([a-z0-9]*\/?)+(\.[a-z0-9]*)?)[\"']", chunk)])
+            hrefs.extend([res[0] for res in re.findall("(https?:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)", chunk)])
     except Exception as e:
         log.info("failed to parse page")
         log.info(e)
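The main change above is to run the href/URL regexes over fixed-size chunks of the response body instead of over the whole text at once. A standalone sketch of that chunked-scan idea, with a hypothetical function name and sample input, where the loop variable is used directly as the chunk's starting character offset:

    import re

    # One of the URL patterns from main.py, precompiled for reuse.
    URL_PATTERN = re.compile(r"(https?:\/\/([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}\/)")

    def find_urls_chunked(text, chunk_size=5000):
        """Scan text in fixed-size chunks and collect absolute URL matches."""
        urls = []
        for start in range(0, len(text), chunk_size):
            chunk = text[start:start + chunk_size]  # slice by character offset
            urls.extend(match[0] for match in URL_PATTERN.findall(chunk))
        return urls

    # Hypothetical usage with a small page body.
    print(find_urls_chunked("<a href='https://bitscuit.be/'>home</a>"))

A match that straddles a chunk boundary is missed by this scheme; overlapping consecutive chunks slightly would avoid that, at the cost of occasional duplicate matches.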