piermesh/src/Components/hopper.py

102 lines
2.9 KiB
Python
Executable File

# NOTE: Used for requesting web pages
import requests
# NOTE: Used for parsing web pages
from bs4 import BeautifulSoup
# NOTE: Generic imports
import base64
import mimetypes
import logging
# NOTE: The "__main__." prefix places this logger below the logger named "__main__"
# in the logging hierarchy (presumably handlers are configured there; verify)
logger = logging.getLogger("__main__." + __name__)
def downloadFile(url, text=True, mimeType=None):
    """
    Download resource from url and convert it to text or a data url

    Parameters
    ----------
    url: str
        Address of the resource to fetch

    text: bool
        If True return the body decoded as UTF-8, otherwise return a
        base64 data url

    mimeType: str
        Mime type to put in the data url, guessed from the url when None
        (only used when text is False)

    Returns
    -------
    str
        The decoded body when text is True, otherwise
        "data:<mimeType>;base64,<payload>"

    Raises
    ------
    ValueError
        text is False, no mimeType was supplied and none could be guessed
        from the url (raised before any request is made)

    requests.HTTPError
        The server answered with an error status

    UnicodeDecodeError
        text is True and the body is not valid UTF-8
    """
    if not text and mimeType is None:
        # NOTE: Resolve the mime type before downloading so an unusable url
        # fails without transferring the whole resource first
        mimeType, _ = mimetypes.guess_type(url)
        if mimeType is None:
            raise ValueError(
                "Couldnt guess mime type and none was supplied, cant encode to data url"
            )
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        # NOTE: Join the chunks once, += on bytes recopies the buffer per chunk
        fbytes = b"".join(r.iter_content(chunk_size=8192))
    if text:
        return fbytes.decode("utf-8")
    b64str = base64.b64encode(fbytes).decode("utf-8")
    return "data:{0};base64,{1}".format(mimeType, b64str)
def get(url: str, params=None, followTags=None):
    """
    http/s get request

    Parameters
    ----------
    url: str
        Address to request

    params
        Requests (library) parameters

    followTags
        None or list of tags to download the src/href from. Supported tags
        are "img", "video", "script" (src inlined) and "link" (stylesheets
        replaced by a style tag); other tags are ignored

    Returns
    -------
    dict
        "response": body text, or the re-serialized html with the
        downloaded resources inlined when followTags is given

        "code": http status code

        "content-type": value of the content-type header or None
    """
    # NOTE: Function scope import so this block does not rely on the file header
    from urllib.parse import urljoin

    logger.debug("Hopping to it")
    # TODO: Non blocking requests
    # WARN: Do not run self requests until this is fixed
    resp = requests.get(url, params=params)
    logger.debug("Content retrieved, parsing")
    r = {
        "response": resp.text,
        "code": resp.status_code,
        "content-type": resp.headers.get("content-type"),
    }
    logger.debug("Done parsing")
    # TODO: Reject followtags if content type is other then html
    if followTags is not None:
        soup = BeautifulSoup(r["response"], "html.parser")
        # NOTE: Relative links are resolved against the url that was actually
        # served (after redirects)
        baseUrl = resp.url

        def fetchable(ref):
            # Missing/empty references and already inlined data urls have
            # nothing to download
            return bool(ref) and not ref.startswith("data:")

        for tag in followTags:
            if tag in ["img", "video"]:
                for elem in soup.find_all(tag):
                    src = elem.get("src")
                    if fetchable(src):
                        elem["src"] = downloadFile(urljoin(baseUrl, src), text=False)
            elif tag == "link":
                for elem in soup.find_all(tag):
                    # NOTE: bs4 parses rel as a list of tokens, comparing it
                    # to a plain string never matches
                    rel = elem.get("rel") or []
                    if isinstance(rel, str):
                        rel = rel.split()
                    isStylesheet = "stylesheet" in [token.lower() for token in rel]
                    href = elem.get("href")
                    if isStylesheet and fetchable(href):
                        styleTag = soup.new_tag("style")
                        styleTag.string = downloadFile(urljoin(baseUrl, href))
                        # NOTE: Swapping in place keeps the cascade order and
                        # works for documents without a head
                        elem.replace_with(styleTag)
            elif tag == "script":
                for elem in soup.find_all(tag):
                    src = elem.get("src")
                    if fetchable(src):
                        elem.string = downloadFile(urljoin(baseUrl, src))
                        # NOTE: A script that still carries a src attribute
                        # has its inline body ignored by browsers
                        del elem["src"]
        # NOTE: Serialize the markup, soup.text would drop every tag and with
        # it the resources inlined above
        r["response"] = str(soup)
    logger.debug("Done hopping")
    return r
def post(url: str, params=None):
    """
    http/s post request

    Parameters
    ----------
    url: str
        Address to post to

    params
        Handed to requests as the `data` argument (the request body)

    Returns
    -------
    dict
        "response": body text of the reply

        "code": http status code of the reply
    """
    reply = requests.post(url, data=params)
    return dict(response=reply.text, code=reply.status_code)