test-ragp / src /rag_pipelines /unstructured /unstructured_chunker.py
awinml's picture
Upload 107 files
336f4a9 verified
"""Unstructured document chunking library.
This module provides the `UnstructuredChunker` class for chunking unstructured documents
using different strategies from the `unstructured` library. It supports:
- "basic" chunking: Splits documents into chunks based on character limits with
optional overlap and combining of smaller text elements.
- "by_title" chunking: Splits documents into sections based on titles, considering
options like maximum characters and allowing sections to span multiple pages.
The chunker can be configured with various parameters for each strategy, allowing
fine-grained control over the resulting chunks.
Example Usage:
```python
from dataloaders.text_splitters import UnstructuredChunker
# Initialize the chunker
chunker = UnstructuredChunker()
# Load your documents
documents = [...]
# Chunk the documents using the "basic" strategy
chunked_documents = chunker.transform_documents(documents)
"""
import logging
from typing import Any, Literal, Optional
import weave
from langchain_core.documents import Document
from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import Element, NarrativeText
from rag_pipelines.utils import LoggerFactory
# Initialize logger
logger_factory = LoggerFactory(logger_name=__name__, log_level=logging.INFO)
logger = logger_factory.get_logger()
class UnstructuredChunker(weave.Model):
"""A class for chunking documents using different strategies provided by the unstructured library.
Supports both "basic" and "by_title" chunking strategies.
Attributes:
chunking_strategy (Literal["basic", "by_title"]): The strategy to use for chunking elements.
max_characters (int): Maximum number of characters in each chunk for the "basic" strategy.
new_after_n_chars (int): Number of characters after which a new chunk is forced for the "basic" strategy.
overlap (int): Number of characters to overlap between chunks for the "basic" strategy.
overlap_all (bool): Whether to overlap all chunks for the "basic" strategy.
combine_text_under_n_chars (Optional[int]): Maximum characters to combine smaller text elements for the "basic" strategy.
include_orig_elements (Optional[bool]): Whether to include original elements in the output for the "basic" strategy.
multipage_sections (Optional[bool]): Whether to allow sections to span multiple pages for the "by_title" strategy.
"""
chunking_strategy: Literal["basic", "by_title"]
max_characters: int
new_after_n_chars: int
overlap: int
overlap_all: bool
combine_text_under_n_chars: Optional[int]
include_orig_elements: Optional[bool]
multipage_sections: Optional[bool]
def __init__(
self,
chunking_strategy: Literal["basic", "by_title"] = "basic",
max_characters: int = 500,
new_after_n_chars: int = 500,
overlap: int = 0,
overlap_all: bool = False,
combine_text_under_n_chars: Optional[int] = None,
include_orig_elements: Optional[bool] = None,
multipage_sections: Optional[bool] = None,
):
"""Initialize the chunker with the specified chunking strategy and parameters.
Args:
chunking_strategy (Literal["basic", "by_title"], optional): Chunking strategy to use. Defaults to "basic".
max_characters (int, optional): Maximum characters in a chunk for the "basic" strategy. Defaults to 500.
new_after_n_chars (int, optional): Characters after which a new chunk is forced for the "basic" strategy.
Defaults to 500.
overlap (int, optional): Characters to overlap between chunks for the "basic" strategy. Defaults to 0.
overlap_all (bool, optional): Overlap all chunks for the "basic" strategy. Defaults to False.
combine_text_under_n_chars (Optional[int], optional): Combine text elements under this limit for the
"basic" strategy. Defaults to None.
include_orig_elements (Optional[bool], optional): Include original elements in output for the "basic"
strategy. Defaults to None.
multipage_sections (Optional[bool], optional): Allow sections to span multiple pages for the "by_title"
strategy. Defaults to None.
"""
super().__init__(
chunking_strategy=chunking_strategy,
max_characters=max_characters,
new_after_n_chars=new_after_n_chars,
overlap=overlap,
overlap_all=overlap_all,
combine_text_under_n_chars=combine_text_under_n_chars,
include_orig_elements=include_orig_elements,
multipage_sections=multipage_sections,
)
self.chunking_strategy = chunking_strategy
self.max_characters = max_characters
self.new_after_n_chars = new_after_n_chars
self.overlap = overlap
self.overlap_all = overlap_all
self.combine_text_under_n_chars = combine_text_under_n_chars
self.include_orig_elements = include_orig_elements
self.multipage_sections = multipage_sections
def _convert_documents_to_elements(
self, documents: list[Document]
) -> tuple[list[NarrativeText], list[dict[str, Any]]]:
"""Convert a list of LangChain documents to unstructured NarrativeText elements.
This method takes in a list of LangChain Document objects and converts each
document into a NarrativeText element. It also extracts and stores the metadata
of each document separately in a list.
Args:
documents (List[Document]): A list of LangChain Document objects to be converted to NarrativeText elements.
Returns:
tuple[list[NarrativeText], list[dict[str, Any]]]:
- A list of unstructured NarrativeText elements, where each element corresponds to a document's text.
- A list of metadata dictionaries, where each dictionary corresponds to the metadata of a document in the
input list.
"""
elements = []
element_metadatas = []
for document in documents:
# Convert each document into a NarrativeText element
element = NarrativeText(text=document.page_content)
elements.append(element)
# Store the metadata separately
element_metadatas.append(document.metadata)
logger.debug(f"Converted {len(documents)} documents to elements.")
return elements, element_metadatas
def _convert_chunked_elements_to_documents(self, elements: list[Element]) -> list[Document]:
"""Convert a list of chunked unstructured elements back to LangChain documents.
Args:
elements (List[Element]): List of chunked unstructured elements.
Returns:
List[Document]: List of LangChain documents converted from elements.
"""
documents = []
for element in elements:
document = Document(page_content=element.text, metadata=element.metadata.to_dict())
documents.append(document)
logger.debug(f"Converted {len(elements)} chunked elements to documents.")
return documents
def transform_documents(self, documents: list[Document]) -> list[Document]:
"""Chunks the provided documents based on the configured strategy.
Args:
documents (List[Document]): List of documents to be chunked.
Returns:
List[Document]: List of chunked documents.
Raises:
ValueError: If no documents are provided or if an unsupported chunking strategy is specified.
"""
if not documents:
msg = "No documents provided for transformation."
logger.error(msg)
raise ValueError(msg)
logger.info(f"Transforming {len(documents)} documents using strategy: {self.chunking_strategy}")
all_chunked_documents = []
# Convert each document to unstructured elements and separate metadata
elements, element_metadatas = self._convert_documents_to_elements(documents)
# Apply the selected chunking strategy
for i, element in enumerate(elements):
metadata = element_metadatas[i] # Get the metadata for the current document
if self.chunking_strategy == "basic":
chunked_elements = chunk_elements(
[element], # Process one element at a time
max_characters=self.max_characters,
new_after_n_chars=self.new_after_n_chars,
overlap=self.overlap,
overlap_all=self.overlap_all,
include_orig_elements=self.include_orig_elements,
)
elif self.chunking_strategy == "by_title":
chunked_elements = chunk_by_title(
[element], # Process one element at a time
max_characters=self.max_characters,
new_after_n_chars=self.new_after_n_chars,
overlap=self.overlap,
overlap_all=self.overlap_all,
include_orig_elements=self.include_orig_elements,
multipage_sections=self.multipage_sections,
combine_text_under_n_chars=self.combine_text_under_n_chars,
)
else:
msg = f"Unsupported chunking strategy: {self.chunking_strategy}"
logger.error(msg)
raise ValueError(msg)
logger.info(f"Chunked element into {len(chunked_elements)} sub-elements.")
# Add metadata to each chunk and convert back to Document
for chunk in chunked_elements:
chunked_document = Document(page_content=chunk.text, metadata=metadata)
all_chunked_documents.append(chunked_document)
logger.info(f"Combined all chunked documents into {len(all_chunked_documents)} documents.")
return all_chunked_documents