import requests
import queue
import re
import logging as log

# Init logging
log.basicConfig(level=log.INFO, format="%(asctime)-15s %(levelname)-8s %(message)s")
log.info("Shammer is getting ready...")

# Init static vars
INDEX_URL = "https://bitscuit.be/"

# Create session
session = requests.Session()

# Create website queue
urls = queue.Queue()
urls.put(INDEX_URL)

# Create set to store visited sites
visited = set([INDEX_URL])

# Loop
while not urls.empty():
    url = urls.get()

    # Perform request
    log.info("Fetching '%s'..." % url)
    try:
        r = session.get(url)
    except Exception as e:
        # Skip this URL on a failed request, otherwise r would be undefined below
        log.info("failed")
        log.info(e)
        continue

    # Read response
    if r.status_code != 200:
        log.info("returned status %d" % r.status_code)
        r.close()
        continue

    # Filter page for hrefs (absolute URLs pointing to a site root)
    hrefs = [res[0] for res in re.findall(r"(https?://([a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}/)", r.text)]
    r.close()

    # Add unseen URLs to the queue
    numAdded = 0
    for href in hrefs:
        if href not in visited:
            urls.put(href)
            visited.add(href)
            numAdded += 1
    log.info("%d urls, %d new, queue length %d" % (len(hrefs), numAdded, urls.qsize()))