Wrote first version
This commit is contained in:
		
							parent
							
								
									b3e781bd98
								
							
						
					
					
						commit
						3866b2be0f
					
				
							
								
								
									
										37
									
								
								main.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								main.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,37 @@ | |||||||
|  | import requests | ||||||
|  | import queue | ||||||
|  | import re | ||||||
|  | 
 | ||||||
# Init static vars
INDEX_URL = "https://bitscuit.be/"

# Seconds to wait on a single request before giving up, so one stalled
# server cannot hang the whole crawl.
REQUEST_TIMEOUT = 10

# Absolute http(s) URL up to and including the first "/" after the host.
# Raw string so the regex escapes are not interpreted as string escapes;
# compiled once because it is applied to every fetched page.
# NOTE: hosts containing "-" or upper-case letters are not matched.
HREF_PATTERN = re.compile(r"https?://(?:[a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}/")


def extract_hrefs(text):
    """Return the site root URLs found in *text*, in order of appearance.

    Args:
        text: Page body (or any string) to scan.

    Returns:
        A list of strings such as ``"https://example.com/"``. Duplicates are
        kept; an empty list is returned when nothing matches.
    """
    # The pattern has no capturing groups, so findall yields whole matches.
    return HREF_PATTERN.findall(text)


def crawl(start_url, session=None):
    """Crawl breadth-first from *start_url*, following discovered site roots.

    Args:
        start_url: First URL to fetch.
        session: Object exposing ``get(url, timeout=...)`` whose result has
            ``status_code`` and ``text`` attributes. A new
            ``requests.Session`` is created when omitted.

    Returns:
        The URLs that answered with HTTP 200, in the order they were fetched.
    """
    if session is None:
        session = requests.Session()

    # Create website queue
    urls = queue.Queue()
    urls.put(start_url)

    # Every URL ever queued; without it, pages linking to each other (or to
    # themselves) would be fetched again and again and the loop never ends.
    seen = {start_url}
    fetched = []

    # Loop
    while not urls.empty():
        url = urls.get()

        # Perform request
        print("Fetching url '%s'..." % url, end="")
        try:
            r = session.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One unreachable host must not abort the rest of the crawl.
            print("\tfailed: %s" % exc)
            continue
        print("\tdone")

        # Read response
        if r.status_code != 200:
            print("returned %d" % r.status_code)
            continue
        fetched.append(url)

        # Filter page for hrefs
        hrefs = extract_hrefs(r.text)
        print(hrefs)

        # Keep only URLs that were never queued before, preserving order
        new_hrefs = []
        for href in hrefs:
            if href not in seen:
                seen.add(href)
                new_hrefs.append(href)

        # Add to queue
        print("adding %d new urls" % len(new_hrefs))
        for href in new_hrefs:
            urls.put(href)

    return fetched


if __name__ == "__main__":
    crawl(INDEX_URL)
|  | 
 | ||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user