# NOTE: Used for requesting web pages
import requests

# NOTE: Used for parsing web pages
from bs4 import BeautifulSoup

# NOTE: Generic imports
import base64
import mimetypes
import logging
from urllib.parse import urljoin

logger = logging.getLogger("__main__." + __name__)


def downloadFile(url, text=True, mimeType=None):
    """
    Download resource from url and convert it to text or a data url

    Parameters
    ----------
    url: str
        Absolute url of the resource to download
    text: bool
        If True the body is decoded as utf-8 and returned as a string,
        otherwise it is returned as a base64 encoded data url
    mimeType
        Mime type to put in the data url, guessed from the url when None

    Returns
    -------
    str
        The decoded text or the data url

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status
    ValueError
        If a data url was requested and no mime type is known
    """
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        # NOTE: join once instead of += per chunk, which copies the
        # whole buffer every iteration
        fbytes = b"".join(r.iter_content(chunk_size=8192))

    if text:
        return fbytes.decode("utf-8")

    if mimeType is None:
        mimeType, _encoding = mimetypes.guess_type(url)
    if mimeType is None:
        raise ValueError(
            "Couldnt guess mime type and none was supplied, cant encode to data url"
        )

    b64str = base64.b64encode(fbytes).decode("utf-8")
    return "data:{0};base64,{1}".format(mimeType, b64str)


def _inlineResources(soup, followTags, baseUrl):
    """
    Replace external resources of the given tags with their inlined content

    The soup is modified in place. Links are resolved against baseUrl so
    relative src/href values work; absolute urls are left unchanged.
    """
    for tag in followTags:
        if tag in ("img", "video"):
            for elem in soup.find_all(tag):
                src = elem.get("src")
                # NOTE: nothing to fetch, or already inlined
                if not src or src.startswith("data:"):
                    continue
                elem["src"] = downloadFile(urljoin(baseUrl, src), text=False)
        elif tag == "link":
            for elem in soup.find_all(tag):
                # NOTE: bs4 parses rel as a multi valued attribute (a list),
                # comparing it to a plain string never matches
                rel = elem.get("rel") or []
                if isinstance(rel, str):
                    rel = rel.split()
                href = elem.get("href")
                if not href or "stylesheet" not in [v.lower() for v in rel]:
                    continue
                style = soup.new_tag("style")
                style.string = downloadFile(urljoin(baseUrl, href))
                # NOTE: swapping in place keeps the cascade order and also
                # works for documents without a <head>
                elem.replace_with(style)
        elif tag == "script":
            for elem in soup.find_all(tag):
                src = elem.get("src")
                # NOTE: inline scripts have no src, leave them alone
                if not src:
                    continue
                elem.string = downloadFile(urljoin(baseUrl, src))
                # NOTE: a script with a src attribute (even an empty one)
                # ignores its inline body, so the attribute has to go
                del elem["src"]


def get(url: str, params=None, followTags=None):
    """
    http/s get request

    Parameters
    ----------
    url: str
    params
        Requests (library) parameters
    followTags
        None or list of tags to download the src/href from

    Returns
    -------
    dict
        "response" (body text, or the html with inlined resources when
        followTags is given), "code" (status code) and "content-type"
    """
    logger.debug("Hopping to it")
    # TODO: Non blocking requests
    # WARN: Do not run self requests until this is fixed
    response = requests.get(url, params=params)
    logger.debug("Content retrieved, parsing")
    r = {
        "response": response.text,
        "code": response.status_code,
        "content-type": response.headers.get("content-type"),
    }
    logger.debug("Done parsing")

    # TODO: Reject followtags if content type is other then html
    if followTags is not None:
        soup = BeautifulSoup(r["response"], "html.parser")
        # NOTE: response.url is the final url after redirects, which is
        # what relative links in the document are relative to
        _inlineResources(soup, followTags, response.url)
        # NOTE: str(soup) keeps the markup, soup.text would only return
        # the text nodes and throw away everything that was inlined
        r["response"] = str(soup)

    logger.debug("Done hopping")
    return r


def post(url: str, params=None):
    """
    http/s post request

    Parameters
    ----------
    url: str
    params
        Requests (library) parameters, sent as the request body (data)

    Returns
    -------
    dict
        "response" (body text) and "code" (status code)
    """
    response = requests.post(url, data=params)
    return {"response": response.text, "code": response.status_code}