from requests.exceptions import JSONDecodeError as JSONDecodeError
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from typing import Union, Any, Dict, List, Literal, Optional, Tuple
import pandas as pd
import requests
[docs]
def clean_html(html_string: str) -> str:
    """
    Strip the markup from an HTML string and return its plain text.

    The string is parsed with BeautifulSoup's ``html.parser`` backend, the
    text is extracted with a single-space separator and stripping enabled,
    and every non-breaking space (U+00A0) in the result is replaced by a
    regular space.

    Args:
        html_string (str): Non-empty HTML markup to convert.

    Returns:
        str: The text content of the markup, without tags.

    Raises:
        AssertionError: If ``html_string`` is not a ``str`` or is empty.
    """
    # Input validation; the messages are part of the observable behavior.
    assert isinstance(html_string, str), "The input must be a string"
    assert html_string != "", "The input string cannot be empty"

    parsed = BeautifulSoup(html_string, "html.parser")
    # Extract the text, then normalize non-breaking spaces to plain spaces.
    return parsed.get_text(separator=" ", strip=True).replace("\xa0", " ")
[docs]
def convertMillis(millis: int) -> str:
    """
    Turn a Unix timestamp given in milliseconds into a UTC calendar date.

    Args:
        millis (int): Milliseconds elapsed since the Unix epoch.

    Returns:
        str: The corresponding UTC date formatted as 'YYYY-MM-DD'.

    Raises:
        AssertionError: If ``millis`` is not an integer.
    """
    assert isinstance(millis, int), "The input must be an integer"

    # fromtimestamp expects seconds, so scale the millisecond value down.
    seconds = millis / 1000.0
    moment = datetime.fromtimestamp(seconds, tz=timezone.utc)
    return moment.strftime("%Y-%m-%d")
[docs]
def parse_response(response: requests.Response) -> List[dict[Any, Any]]:
    """
    Validate an HTTP response and return its JSON payload as a list.

    The response must have status code 200, a ``content-type`` header
    containing ``application/json``, and a body that decodes to a JSON list.

    Args:
        response (requests.Response): The response to validate and decode.

    Returns:
        List[dict[Any, Any]]: The decoded JSON list. Only the outer list type
        is checked; the element types are not validated here.

    Raises:
        AssertionError: If the status code is not 200.
        ValueError: If the content type is not JSON (with a dedicated
            message when the body is empty).
        JSONDecodeError: If the body cannot be decoded as JSON.
        TypeError: If the decoded JSON is not a list.
    """
    content_type: str = response.headers.get("content-type", "")

    if response.status_code != 200:
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped when Python runs with -O; the exception type callers see
        # is unchanged.
        raise AssertionError(
            "The response status code must be 200 OK but is {}".format(
                response.status_code
            )
        )

    if "application/json" not in content_type:
        if response.content == b"":
            raise ValueError(
                "The response content is empty. "
                "Check your request parameters and try again."
            )
        raise ValueError(
            "The response content type must be 'application/json' but is {}".format(
                content_type
            )
        )

    try:
        response_list = response.json()
    except JSONDecodeError as err:
        # JSONDecodeError requires (msg, doc, pos); constructing it with the
        # message alone raises TypeError and hides the real failure.
        raise JSONDecodeError(
            "Failed to decode JSON response", err.doc, err.pos
        ) from err

    if not isinstance(response_list, list):
        raise TypeError(
            "The JSON response must be a list but is a {}".format(
                type(response_list)
            )
        )
    return response_list