Data Ingestion Improvement/Cleanup/Bug Fix - Part 2 (#307)
Code is ready for review; evaluations/testing are still in progress.

### Description
- This is **part 2** of the data ingestion improvements. The goal is to investigate potential issues with the data ingestion process, remove noisy data such as badly formatted content or gibberish text, fix bugs, and enforce standards on the process so that ingestion cost is predictable and reduced.

### Technical Changes
- Renamed the original `split.py` file to `chunking_utils.py` (multiple
files changed due to import naming)
- While making the above change, found that 2 DAGs were somehow never updated: the Provider Docs and Astro SDK Docs incremental ingest DAGs never had a chunking/splitting task (even though the bulk ingest DAG does). Chunking has been added to both DAGs.
- Airflow Docs extraction process has been improved (see code for details)
- Provider Docs extraction process has been improved (see code for
details)
- Astro SDK Docs extraction process has been improved (see code for details). Astro SDK docs are not currently being ingested, but the code is fixed since it still exists.
- Provider docs used to ingest all past versions, leading to many duplicate docs in the DB. Ingestion is now limited to only the `stable` version of the docs (a rough illustrative sketch follows this list).
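For illustration only, a minimal sketch of the stable-version filtering idea. The function name and URL shapes below are hypothetical and are not the actual extraction code, which lives in the provider docs extract task:

```python
# Hypothetical sketch: keep only doc links under the "stable" version path
# instead of every past release. Names and URL shapes are made up.
def keep_stable_links(doc_links: list[str]) -> list[str]:
    """Drop links that point at versioned doc paths and keep /stable/ only."""
    return [link for link in doc_links if "/stable/" in link]


links = [
    "https://example.com/providers/foo/versions/stable/modules/bar",
    "https://example.com/providers/foo/versions/1.2.0/modules/bar",
]
print(keep_stable_links(links))  # only the /stable/ link survives
```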

### Evaluations
- Answer quality generally improved, with the added benefit of more concise answers and linked hyperlinks when the model knows its answer is not exhaustive.
- See the attached files for a quick list of results:

[data_ingest_comparison_part_2.csv](https://github.com/astronomer/ask-astro/files/14484682/data_ingest_comparison_part_2.csv)

[data_ingest_results_part_2.csv](https://github.com/astronomer/ask-astro/files/14484683/data_ingest_results_part_2.csv)
- Bulk ingest also runs with no issues
<img width="668" alt="image"
src="https://github.com/astronomer/ask-astro/assets/26350341/635c1ba0-b6b0-4fcd-81ed-8c337f7b3f78">

### Related Issues
closes #221
closes #258 
closes #295 (the reranker has been addressed via GCP environment variables; the embedding model change was completed in a different PR)
closes #285 (this PR prevents empty docs from being ingested; a minimal reproduction of the guard is included below)
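For context on #285, the empty-doc guard boils down to dropping whitespace-only chunks before ingestion (the exact filter appears in the `chunking_utils.py` diff below). A minimal reproduction of that behaviour:

```python
# Minimal reproduction of the blank-chunk guard used in chunking_utils.py.
import pandas as pd

df = pd.DataFrame({"content": ["real chunk", "   ", ""]})
df = df[~df["content"].apply(lambda x: x.isspace() or x == "")]
print(df)  # only the "real chunk" row remains
```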
davidgxue committed Mar 4, 2024
1 parent d8b0e1d commit df80ddc
Showing 19 changed files with 139 additions and 56 deletions.
2 changes: 1 addition & 1 deletion airflow/Dockerfile
@@ -1 +1 @@
-FROM quay.io/astronomer/astro-runtime:10.0.0
+FROM quay.io/astronomer/astro-runtime:10.1.0
4 changes: 2 additions & 2 deletions airflow/dags/ingestion/ask-astro-forum-load.py
@@ -36,9 +36,9 @@ def get_astro_forum_content():
     ),
 )
 def ask_astro_load_astro_forum():
-    from include.tasks import split
+    from include.tasks import chunking_utils

-    split_docs = task(split.split_html).expand(dfs=[get_astro_forum_content()])
+    split_docs = task(chunking_utils.split_html).expand(dfs=[get_astro_forum_content()])

     _import_data = WeaviateDocumentIngestOperator.partial(
         class_name=WEAVIATE_CLASS,
4 changes: 2 additions & 2 deletions airflow/dags/ingestion/ask-astro-load-airflow-docs.py
@@ -35,10 +35,10 @@ def ask_astro_load_airflow_docs():
     data from a point-in-time data capture. By using the upsert logic of the weaviate_import decorator
     any existing documents that have been updated will be removed and re-added.
     """
-    from include.tasks import split
+    from include.tasks import chunking_utils
     from include.tasks.extract import airflow_docs

-    extracted_airflow_docs = task(split.split_html).expand(
+    extracted_airflow_docs = task(chunking_utils.split_html).expand(
         dfs=[airflow_docs.extract_airflow_docs(docs_base_url=airflow_docs_base_url)]
     )

4 changes: 2 additions & 2 deletions airflow/dags/ingestion/ask-astro-load-astro-cli.py
@@ -32,11 +32,11 @@ def ask_astro_load_astro_cli_docs():
     data from a point-in-time data capture. By using the upsert logic of the weaviate_import decorator
     any existing documents that have been updated will be removed and re-added.
     """
-    from include.tasks import split
+    from include.tasks import chunking_utils
     from include.tasks.extract import astro_cli_docs

     extract_astro_cli_docs = task(astro_cli_docs.extract_astro_cli_docs)()
-    split_md_docs = task(split.split_html).expand(dfs=[extract_astro_cli_docs])
+    split_md_docs = task(chunking_utils.split_html).expand(dfs=[extract_astro_cli_docs])

     _import_data = WeaviateDocumentIngestOperator.partial(
         class_name=WEAVIATE_CLASS,
6 changes: 5 additions & 1 deletion airflow/dags/ingestion/ask-astro-load-astro-sdk.py
@@ -37,6 +37,10 @@ def get_astro_sdk_content():
     ),
 )
 def ask_astro_load_astro_sdk():
+    from include.tasks import chunking_utils
+
+    split_docs = task(chunking_utils.split_html).expand(dfs=[get_astro_sdk_content()])
+
     _import_data = WeaviateDocumentIngestOperator.partial(
         class_name=WEAVIATE_CLASS,
         existing="replace",
@@ -45,7 +49,7 @@ def ask_astro_load_astro_sdk():
         verbose=True,
         conn_id=_WEAVIATE_CONN_ID,
         task_id="WeaviateDocumentIngestOperator",
-    ).expand(input_data=[get_astro_sdk_content()])
+    ).expand(input_data=[split_docs])


 ask_astro_load_astro_sdk()
4 changes: 2 additions & 2 deletions airflow/dags/ingestion/ask-astro-load-astronomer-docs.py
@@ -31,12 +31,12 @@ def ask_astro_load_astronomer_docs():
     """
     This DAG performs incremental load for any new docs in astronomer docs.
     """
-    from include.tasks import split
+    from include.tasks import chunking_utils
     from include.tasks.extract.astro_docs import extract_astro_docs

     astro_docs = task(extract_astro_docs)()

-    split_html_docs = task(split.split_html).expand(dfs=[astro_docs])
+    split_html_docs = task(chunking_utils.split_html).expand(dfs=[astro_docs])

     _import_data = WeaviateDocumentIngestOperator.partial(
         class_name=WEAVIATE_CLASS,
6 changes: 5 additions & 1 deletion airflow/dags/ingestion/ask-astro-load-astronomer-provider.py
@@ -43,6 +43,10 @@ def ask_astro_load_astronomer_providers():
     any existing documents that have been updated will be removed and re-added.
     """

+    from include.tasks import chunking_utils
+
+    split_docs = task(chunking_utils.split_html).expand(dfs=[get_provider_content()])
+
     _import_data = WeaviateDocumentIngestOperator.partial(
         class_name=WEAVIATE_CLASS,
         existing="replace",
@@ -51,7 +55,7 @@ def ask_astro_load_astronomer_providers():
         verbose=True,
         conn_id=_WEAVIATE_CONN_ID,
         task_id="WeaviateDocumentIngestOperator",
-    ).expand(input_data=[get_provider_content()])
+    ).expand(input_data=[split_docs])


 ask_astro_load_astronomer_providers()
4 changes: 2 additions & 2 deletions airflow/dags/ingestion/ask-astro-load-blogs.py
@@ -34,12 +34,12 @@ def ask_astro_load_blogs():
     data from a point-in-time data capture. By using the upsert logic of the weaviate_import decorator
     any existing documents that have been updated will be removed and re-added.
     """
-    from include.tasks import split
+    from include.tasks import chunking_utils
     from include.tasks.extract import blogs

     blogs_docs = task(blogs.extract_astro_blogs)(blog_cutoff_date=blog_cutoff_date)

-    split_md_docs = task(split.split_markdown).expand(dfs=[blogs_docs])
+    split_md_docs = task(chunking_utils.split_markdown).expand(dfs=[blogs_docs])

     _import_data = WeaviateDocumentIngestOperator.partial(
         class_name=WEAVIATE_CLASS,
4 changes: 2 additions & 2 deletions airflow/dags/ingestion/ask-astro-load-cosmos-docs.py
@@ -33,9 +33,9 @@ def ask_astro_load_cosmos_docs():
     any existing documents that have been updated will be removed and re-added.
     """

-    from include.tasks import split
+    from include.tasks import chunking_utils

-    split_docs = task(split.split_html).expand(dfs=[extract_cosmos_docs()])
+    split_docs = task(chunking_utils.split_html).expand(dfs=[extract_cosmos_docs()])

     _import_data = WeaviateDocumentIngestOperator.partial(
         class_name=WEAVIATE_CLASS,
4 changes: 2 additions & 2 deletions airflow/dags/ingestion/ask-astro-load-github.py
@@ -43,7 +43,7 @@ def ask_astro_load_github():
     data from a point-in-time data capture. By using the upsert logic of the weaviate_import decorator
     any existing documents that have been updated will be removed and re-added.
     """
-    from include.tasks import split
+    from include.tasks import chunking_utils
    from include.tasks.extract import github

     md_docs = (
@@ -58,7 +58,7 @@ def ask_astro_load_github():
         .expand(repo_base=issues_docs_sources)
     )

-    split_md_docs = task(split.split_markdown).expand(dfs=[md_docs, issues_docs])
+    split_md_docs = task(chunking_utils.split_markdown).expand(dfs=[md_docs, issues_docs])

     _import_data = WeaviateDocumentIngestOperator.partial(
         class_name=WEAVIATE_CLASS,
6 changes: 3 additions & 3 deletions airflow/dags/ingestion/ask-astro-load-registry.py
@@ -32,16 +32,16 @@ def ask_astro_load_registry():
     data from a point-in-time data capture. By using the upsert logic of the weaviate_import decorator
     any existing documents that have been updated will be removed and re-added.
     """
-    from include.tasks import split
+    from include.tasks import chunking_utils
     from include.tasks.extract import registry

     registry_cells_docs = task(registry.extract_astro_registry_cell_types)()

     registry_dags_docs = task(registry.extract_astro_registry_dags)()

-    split_md_docs = task(split.split_markdown).expand(dfs=[registry_cells_docs])
+    split_md_docs = task(chunking_utils.split_markdown).expand(dfs=[registry_cells_docs])

-    split_code_docs = task(split.split_python).expand(dfs=[registry_dags_docs])
+    split_code_docs = task(chunking_utils.split_python).expand(dfs=[registry_dags_docs])

     _import_data = WeaviateDocumentIngestOperator.partial(
         class_name=WEAVIATE_CLASS,
4 changes: 2 additions & 2 deletions airflow/dags/ingestion/ask-astro-load-slack.py
@@ -42,12 +42,12 @@ def ask_astro_load_slack():
     DAG should run nightly to capture threads between archive periods. By using the upsert logic of the
     weaviate_import decorator any existing documents that have been updated will be removed and re-added.
     """
-    from include.tasks import split
+    from include.tasks import chunking_utils
     from include.tasks.extract import slack

     slack_docs = task(slack.extract_slack).expand(source=slack_channel_sources)

-    split_md_docs = task(split.split_markdown).expand(dfs=[slack_docs])
+    split_md_docs = task(chunking_utils.split_markdown).expand(dfs=[slack_docs])

     _import_data = WeaviateDocumentIngestOperator.partial(
         class_name=WEAVIATE_CLASS,
4 changes: 2 additions & 2 deletions airflow/dags/ingestion/ask-astro-load-stackoverflow.py
@@ -38,7 +38,7 @@ def ask_astro_load_stackoverflow():
     data from a point-in-time data capture. By using the upsert logic of the weaviate_import decorator
     any existing documents that have been updated will be removed and re-added.
     """
-    from include.tasks import split
+    from include.tasks import chunking_utils
     from include.tasks.extract import stack_overflow

     stack_overflow_docs = (
@@ -47,7 +47,7 @@ def ask_astro_load_stackoverflow():
         .expand(tag=stackoverflow_tags)
     )

-    split_md_docs = task(split.split_markdown).expand(dfs=[stack_overflow_docs])
+    split_md_docs = task(chunking_utils.split_markdown).expand(dfs=[stack_overflow_docs])

     _import_data = WeaviateDocumentIngestOperator.partial(
         class_name=WEAVIATE_CLASS,
8 changes: 4 additions & 4 deletions airflow/dags/ingestion/ask-astro-load.py
@@ -75,7 +75,7 @@ def ask_astro_load_bulk():
     """

-    from include.tasks import split
+    from include.tasks import chunking_utils

     @task
     def get_schema_and_process(schema_file: str) -> list:
@@ -432,11 +432,11 @@ def import_baseline(

     python_code_tasks = [registry_dags_docs]

-    split_md_docs = task(split.split_markdown).expand(dfs=markdown_tasks)
+    split_md_docs = task(chunking_utils.split_markdown).expand(dfs=markdown_tasks)

-    split_code_docs = task(split.split_python).expand(dfs=python_code_tasks)
+    split_code_docs = task(chunking_utils.split_python).expand(dfs=python_code_tasks)

-    split_html_docs = task(split.split_html).expand(dfs=html_tasks)
+    split_html_docs = task(chunking_utils.split_html).expand(dfs=html_tasks)

     _import_data = WeaviateDocumentIngestOperator.partial(
         class_name=WEAVIATE_CLASS,
airflow/include/tasks/chunking_utils.py (renamed from split.py)
@@ -1,13 +1,28 @@
 from __future__ import annotations

+import logging
+
 import pandas as pd
+import tiktoken
 from langchain.schema import Document
 from langchain.text_splitter import (
-    Language,
     RecursiveCharacterTextSplitter,
 )
 from langchain_community.document_transformers import Html2TextTransformer

+logger = logging.getLogger("airflow.task")
+
+TARGET_CHUNK_SIZE = 2500
+
+
+def enforce_max_token_len(text: str) -> str:
+    encoding = tiktoken.get_encoding("cl100k_base")
+    encoded_text = encoding.encode(text)
+    if len(encoded_text) > 8191:
+        logger.info("Token length of string exceeds the max content length of the tokenizer. Truncating...")
+        return encoding.decode(encoded_text[:8191])
+    return text
+
+
 def split_markdown(dfs: list[pd.DataFrame]) -> pd.DataFrame:
     """
@@ -27,11 +42,25 @@ def split_markdown(dfs: list[pd.DataFrame]) -> pd.DataFrame:

     df = pd.concat(dfs, axis=0, ignore_index=True)

-    splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200, separators=["\n\n", "\n", " ", ""])
+    # directly from the langchain splitter library
+    separators = ["\n\n", "\n", " ", ""]
+    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+        # cl100k_base is used for text ada 002 and later embedding models
+        encoding_name="cl100k_base",
+        chunk_size=TARGET_CHUNK_SIZE,
+        chunk_overlap=200,
+        separators=separators,
+        is_separator_regex=True,
+    )

     df["doc_chunks"] = df["content"].apply(lambda x: splitter.split_documents([Document(page_content=x)]))
     df = df.explode("doc_chunks", ignore_index=True)
     df["content"] = df["doc_chunks"].apply(lambda x: x.page_content)

+    # Remove blank doc chunks
+    df = df[~df["content"].apply(lambda x: x.isspace() or x == "")]
+
+    df["content"] = df["content"].apply(enforce_max_token_len)
     df.drop(["doc_chunks"], inplace=True, axis=1)
     df.reset_index(inplace=True, drop=True)

@@ -56,15 +85,36 @@ def split_python(dfs: list[pd.DataFrame]) -> pd.DataFrame:

     df = pd.concat(dfs, axis=0, ignore_index=True)

-    splitter = RecursiveCharacterTextSplitter.from_language(
-        language=Language.PYTHON,
-        # chunk_size=50,
-        chunk_overlap=0,
+    # directly from the langchain splitter library
+    python_separators = [
+        # First, try to split along class definitions
+        "\nclass ",
+        "\ndef ",
+        "\n\tdef ",
+        # Now split by the normal type of lines
+        "\n\n",
+        "\n",
+        " ",
+        "",
+    ]
+
+    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+        # cl100k_base is used for text ada 002 and later embedding models
+        encoding_name="cl100k_base",
+        chunk_size=TARGET_CHUNK_SIZE,
+        chunk_overlap=200,
+        separators=python_separators,
+        is_separator_regex=True,
     )

     df["doc_chunks"] = df["content"].apply(lambda x: splitter.split_documents([Document(page_content=x)]))
     df = df.explode("doc_chunks", ignore_index=True)
     df["content"] = df["doc_chunks"].apply(lambda x: x.page_content)

+    # Remove blank doc chunks
+    df = df[~df["content"].apply(lambda x: x.isspace() or x == "")]
+
+    df["content"] = df["content"].apply(enforce_max_token_len)
     df.drop(["doc_chunks"], inplace=True, axis=1)
     df.reset_index(inplace=True, drop=True)

@@ -93,9 +143,10 @@ def split_html(dfs: list[pd.DataFrame]) -> pd.DataFrame:
     splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
         # cl100k_base is used for text ada 002 and later embedding models
         encoding_name="cl100k_base",
-        chunk_size=4000,
+        chunk_size=TARGET_CHUNK_SIZE,
         chunk_overlap=200,
         separators=separators,
+        is_separator_regex=True,
     )

     # Split by chunking first
@@ -110,17 +161,8 @@ def split_html(dfs: list[pd.DataFrame]) -> pd.DataFrame:
     # Remove blank doc chunks
     df = df[~df["content"].apply(lambda x: x.isspace() or x == "")]

     df["content"] = df["content"].apply(enforce_max_token_len)
     df.drop(["doc_chunks"], inplace=True, axis=1)
     df.reset_index(inplace=True, drop=True)

     return df
-
-
-def split_list(urls: list[str], chunk_size: int = 0) -> list[list[str]]:
-    """
-    split the list of string into chunk of list of string
-    param urls: URL list we want to chunk
-    param chunk_size: Max size of chunked list
-    """
-    return [urls[i : i + chunk_size] for i in range(0, len(urls), chunk_size)]
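Reviewer note: as a quick way to sanity-check the new chunking behaviour outside Airflow, here is a minimal standalone sketch. The sample text is invented; the parameter values mirror the diff above.

```python
# Standalone sketch of the tiktoken-based chunking (sample text is made up).
import tiktoken
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",  # tokenizer used by text-embedding-ada-002 and later
    chunk_size=2500,              # TARGET_CHUNK_SIZE above
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""],
    is_separator_regex=True,
)

text = "Some long extracted documentation content. " * 2000
chunks = splitter.split_documents([Document(page_content=text)])

# Each chunk should also stay under the embedding model's 8191-token limit,
# which is what enforce_max_token_len guards in the module above.
encoding = tiktoken.get_encoding("cl100k_base")
lengths = [len(encoding.encode(chunk.page_content)) for chunk in chunks]
print(f"{len(chunks)} chunks, max tokens per chunk: {max(lengths)}")
```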
5 changes: 2 additions & 3 deletions airflow/include/tasks/extract/airflow_docs.py
@@ -4,12 +4,11 @@
 import urllib.parse

 import pandas as pd
-import requests
 from bs4 import BeautifulSoup
 from weaviate.util import generate_uuid5

 from airflow.decorators import task
-from include.tasks.extract.utils.html_utils import get_internal_links
+from include.tasks.extract.utils.html_utils import fetch_page_content, get_internal_links


 @task
@@ -46,7 +45,7 @@ def extract_airflow_docs(docs_base_url: str) -> list[pd.DataFrame]:

     df = pd.DataFrame(docs_links, columns=["docLink"])

-    df["html_content"] = df["docLink"].apply(lambda x: requests.get(x).content)
+    df["html_content"] = df["docLink"].apply(fetch_page_content)

     df["content"] = df["html_content"].apply(
         lambda x: str(BeautifulSoup(x, "html.parser").find(class_="body", role="main"))
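Reviewer note: the diff above swaps the bare `requests.get` call for the shared `fetch_page_content` helper from `html_utils`. The helper's body is not part of this diff, so the sketch below is only a guess at its shape (the use of tenacity for retries is an assumption), included to illustrate why centralizing the fetch matters.

```python
# Hedged sketch only: the real fetch_page_content lives in
# include/tasks/extract/utils/html_utils.py and is not shown in this diff.
# Retry/backoff via tenacity is an assumption for illustration.
import requests
from tenacity import retry, stop_after_attempt, wait_exponential


@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, max=10))
def fetch_page_content(url: str) -> bytes:
    """Fetch a page with a timeout and retries instead of a bare requests.get."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.content
```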
