# piermesh/src/Components/hopper.py

# NOTE: Used for requesting web pages
import requests

# NOTE: Used for parsing web pages
from bs4 import BeautifulSoup

# NOTE: Generic imports
import base64
import mimetypes
import logging

# NOTE: The name is prefixed with "__main__." so this logger sits below the
# "__main__" logger in the logging hierarchy and its records propagate to it
# (presumably the entry point script configures that logger, confirm)
logger = logging.getLogger("__main__." + __name__)


def downloadFile(url, text=True, mimeType=None):
    """
    Download resource from url and convert it to text or a data url

    Parameters
    ----------
    url
        Address of the resource to download

    text
        If truthy the body is returned decoded as utf-8, otherwise it is
        returned as a base64 data url

    mimeType
        Mime type to put in the data url. When None it is guessed from the
        url, falling back to the Content-Type header of the response

    Returns
    -------
    str
        The decoded body, or "data:<mime type>;base64,<payload>"

    Raises
    ------
    requests.HTTPError
        The server answered with an error status (raise_for_status)

    UnicodeDecodeError
        text is truthy and the body is not valid utf-8

    ValueError
        A data url was requested but no mime type was supplied and none
        could be determined
    """
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        # NOTE: Join the chunks once, += on bytes copies the whole buffer
        # again for every chunk
        fbytes = b"".join(r.iter_content(chunk_size=8192))
        # NOTE: Kept as a fallback for urls without a usable file extension
        headerType = r.headers.get("content-type")

    if text:
        return fbytes.decode("utf-8")

    if mimeType is None:
        mimeType, _ = mimetypes.guess_type(url)
    if mimeType is None and headerType:
        # NOTE: Drop parameters such as "; charset=utf-8"
        mimeType = headerType.split(";", 1)[0].strip() or None
    if mimeType is None:
        raise ValueError(
            "Couldnt guess mime type and none was supplied, cant encode to data url"
        )

    b64str = base64.b64encode(fbytes).decode("utf-8")
    return "data:{0};base64,{1}".format(mimeType, b64str)


def get(url: str, params=None, followTags=None):
    """
    http/s get request

    Parameters
    ----------
    url: str

    params
        Requests (library) parameters

    followTags
        None or list of tags to download the src/href from. Supported tags
        are "img" and "video" (src replaced with a data url), "link"
        (stylesheets replaced with an inline style tag) and "script" (src
        replaced with the inline script body). Other tags are ignored

    Returns
    -------
    dict
        "response" (body text, or the re-serialized html with resources
        inlined when followTags is not None), "code" (status code) and
        "content-type" (Content-Type header or None)

    Raises
    ------
    Whatever requests.get and downloadFile raise, a resource that fails to
    download aborts the whole request
    """
    # NOTE: Function scope import so the module level imports stay untouched
    from urllib.parse import urljoin

    logger.debug("Hopping to it")
    # TODO: Non blocking requests
    # WARN: Do not run self requests until this is fixed
    resp = requests.get(url, params=params)
    logger.debug("Content retrieved, parsing")
    r = {
        "response": resp.text,
        "code": resp.status_code,
        "content-type": resp.headers.get("content-type"),
    }
    logger.debug("Done parsing")

    # TODO: Reject followtags if content type is other then html
    if followTags is not None:
        soup = BeautifulSoup(r["response"], "html.parser")
        # NOTE: Relative links are resolved against the final url (after
        # redirects) when requests reports one
        baseUrl = resp.url or url

        def absolute(link):
            # NOTE: None for tags without the attribute and for resources
            # that are already inlined as data urls
            if not link or link.lower().startswith("data:"):
                return None
            return urljoin(baseUrl, link)

        for tag in followTags:
            if tag in ("img", "video"):
                for elem in soup.find_all(tag):
                    src = absolute(elem.get("src"))
                    if src is not None:
                        elem["src"] = downloadFile(src, text=False)
            elif tag == "link":
                for elem in soup.find_all(tag):
                    # NOTE: bs4 returns rel as a list of values (multi valued
                    # attribute), a plain string only shows up when that
                    # handling is turned off
                    rel = elem.get("rel") or []
                    if isinstance(rel, str):
                        rel = rel.split()
                    href = absolute(elem.get("href"))
                    if href is None or "stylesheet" not in [v.lower() for v in rel]:
                        continue
                    styleTag = soup.new_tag("style")
                    styleTag.string = downloadFile(href)
                    # NOTE: Swapping in place keeps the cascade order and
                    # works for documents without a head
                    elem.replace_with(styleTag)
            elif tag == "script":
                for elem in soup.find_all(tag):
                    src = absolute(elem.get("src"))
                    if src is None:
                        continue
                    script = downloadFile(src)
                    # NOTE: The attribute has to go entirely, browsers ignore
                    # the inline body while a src attribute is present
                    del elem["src"]
                    elem.string = script
        # NOTE: str() keeps the markup, soup.text would strip every tag and
        # with it everything that was just inlined
        r["response"] = str(soup)
    logger.debug("Done hopping")
    return r


def post(url: str, params=None):
    """
    http/s post request

    Parameters
    ----------
    url: str
        Address to post to

    params
        Passed to requests.post as the data argument (the request body)

    Returns
    -------
    dict
        "response" holds the body text and "code" the status code
    """
    reply = requests.post(url, data=params)
    return {
        "response": reply.text,
        "code": reply.status_code,
    }