about summary refs log tree commit diff
path: root/autogpts/autogpt/autogpt/processing/html.py
diff options
context:
space:
mode:
Diffstat (limited to 'autogpts/autogpt/autogpt/processing/html.py')
-rw-r--r-- autogpts/autogpt/autogpt/processing/html.py 33
1 files changed, 33 insertions, 0 deletions
diff --git a/autogpts/autogpt/autogpt/processing/html.py b/autogpts/autogpt/autogpt/processing/html.py
new file mode 100644
index 000000000..73c65b9c9
--- /dev/null
+++ b/autogpts/autogpt/autogpt/processing/html.py
@@ -0,0 +1,33 @@
+"""HTML processing functions"""
+from __future__ import annotations
+
+from bs4 import BeautifulSoup
+from requests.compat import urljoin
+
+
+def extract_hyperlinks(soup: BeautifulSoup, base_url: str) -> list[tuple[str, str]]:
+    """Collect (text, URL) pairs for the anchors found in a parsed document
+
+    Args:
+        soup (BeautifulSoup): Parsed document that is searched for `a` tags
+        base_url (str): URL that each anchor's `href` is joined onto
+
+    Returns:
+        List[Tuple[str, str]]: One (anchor text, joined URL) pair per match
+    """
+    hyperlinks: list[tuple[str, str]] = []
+    for anchor in soup.find_all("a", href=True):
+        hyperlinks.append((anchor.text, urljoin(base_url, anchor["href"])))
+    return hyperlinks
+
+
+def format_hyperlinks(hyperlinks: list[tuple[str, str]]) -> list[str]:
+    """Render hyperlinks as "text (url)" strings for display to the user
+
+    Args:
+        hyperlinks (List[Tuple[str, str]]): (link text, link URL) pairs
+
+    Returns:
+        List[str]: One string per pair; the link text is whitespace-stripped
+    """
+    return [f"{label.strip()} ({target})" for label, target in hyperlinks]