Shammer/main.py

54 lines
1.2 KiB
Python
Raw Normal View History

2021-05-08 17:43:31 +02:00
import requests
import queue
import re
2021-05-08 19:58:05 +02:00
import logging as log
# Configure the root logger: timestamp, padded level name, then the message
LOG_FORMAT = "%(asctime)-15s %(levelname)-8s %(message)s"
log.basicConfig(format=LOG_FORMAT, level=log.INFO)
log.info("Shammer is getting ready...")
2021-05-08 17:43:31 +02:00
# Entry point of the crawl
INDEX_URL = "https://bitscuit.be/"

# Single HTTP session used for every request
session = requests.Session()

# FIFO of pages still to fetch, seeded with the entry point
urls = queue.Queue()
urls.put(INDEX_URL)

# URLs that have already been queued, so no URL is enqueued twice
visited = {INDEX_URL}
2021-05-08 17:43:31 +02:00
# Main crawl loop: take URLs from the queue one at a time, fetch each page,
# extract the site-root URLs it mentions and enqueue those not seen before.

# Seconds to wait on a single request before giving up on that URL; without
# a timeout one unresponsive host would stall the whole crawl.
REQUEST_TIMEOUT = 10

# Matches absolute http/https URLs up to and including the first slash after
# the host (e.g. "https://sub.example.com/"). Only scheme and host are matched;
# anything after that slash is not part of the match. Compiled once, outside
# the loop, as a raw string so the escapes are unambiguous.
URL_PATTERN = re.compile(r"https?://(?:[a-z0-9]+\.)*[a-z0-9]{3,}\.[a-z0-9]{2,}/")

while not urls.empty():
    url = urls.get()

    # Perform request
    log.info("Fetching '%s'...", url)
    try:
        r = session.get(url, timeout=REQUEST_TIMEOUT)
    except Exception as e:
        # Best-effort crawler: any failure on one URL is logged and skipped.
        # Skipping matters: there is no usable response object here, so the
        # code below must not run for this URL.
        log.info("failed")
        log.info(e)
        continue

    # The body has already been read (streaming is off), so the connection
    # can be released right away; r.text and r.status_code stay available.
    r.close()

    # Read response
    if r.status_code != 200:
        log.info("returned status %d", r.status_code)
        continue

    # Filter page for hrefs (the pattern has no capture groups, so findall
    # returns the full matched URLs as strings)
    hrefs = URL_PATTERN.findall(r.text)

    # Add to queue, marking each URL as visited the moment it is enqueued
    num_added = 0
    for href in hrefs:
        if href not in visited:
            urls.put(href)
            visited.add(href)
            num_added += 1
    log.info("%d urls, %d new, queue length %d", len(hrefs), num_added, urls.qsize())
2021-05-08 17:43:31 +02:00