Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 160 additions & 1 deletion libs/text-splitters/langchain_text_splitters/markdown.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from typing import Any, Dict, List, Tuple, TypedDict
import re
from typing import Any, Dict, List, Tuple, TypedDict, Union

from langchain_core.documents import Document

Expand Down Expand Up @@ -221,3 +222,161 @@ class HeaderType(TypedDict):
level: int
name: str
data: str


class ExperimentalMarkdownSyntaxTextSplitter:
    """An experimental text splitter for handling Markdown syntax.

    This splitter aims to retain the exact whitespace of the original text while
    extracting structured metadata, such as headers. It is a re-implementation of
    the MarkdownHeaderTextSplitter with notable changes to the approach and
    additional features.

    Key Features:
        - Retains the original whitespace and formatting of the Markdown text
          inside every chunk.
        - Records the headers enclosing each chunk as metadata.
        - Splits out fenced code blocks (``` or ~~~) and stores the text that
          follows the opening fence (the language) in the "Code" metadata key.
        - Splits text on horizontal rules (`---`, `***`, `___`); the rule line
          itself is not part of any chunk.
        - Defaults to splitting on header levels 1-6, which can be overridden
          using the `headers_to_split_on` parameter.

    Parameters:
    ----------
    headers_to_split_on : List[Tuple[str, str]], optional
        Pairs of (header marker, metadata key), e.g. ``("#", "Header 1")``.
        Defaults to `DEFAULT_HEADER_KEYS` if not specified. Headers whose marker
        is not listed are treated as ordinary content.
    return_each_line : bool, optional
        When set to True, returns each non-blank line as a separate document
        carrying the metadata of its chunk. Default is False.
    strip_headers : bool, optional
        When True (the default) header lines are left out of the chunk content;
        when False the header line starts the chunk it introduces.

    Usage example:

    .. code-block:: python

        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
        ]
        splitter = ExperimentalMarkdownSyntaxTextSplitter(
            headers_to_split_on=headers_to_split_on
        )
        chunks = splitter.split_text(text)
        for chunk in chunks:
            print(chunk)

    Every call to `split_text` starts from a clean state, so one instance can be
    reused for several texts.

    This class is currently experimental and subject to change based on feedback
    and further development.
    """

    DEFAULT_HEADER_KEYS = {
        "#": "Header 1",
        "##": "Header 2",
        "###": "Header 3",
        "####": "Header 4",
        "#####": "Header 5",
        "######": "Header 6",
    }

    # Compiled once here instead of being rebuilt for every input line.
    _HEADER_PATTERN = re.compile(r"^(#{1,6}) (.*)")
    _CODE_FENCE_PATTERN = re.compile(r"^(?:```|~~~)(.*)")
    # `\s*$` (instead of a literal "\n") also accepts CRLF endings, trailing
    # blanks, and a rule sitting on the unterminated last line of the text.
    _HORIZONTAL_RULE_PATTERN = re.compile(r"^(?:\*\*\*+|---+|___+)\s*$")

    def __init__(
        self,
        headers_to_split_on: Union[List[Tuple[str, str]], None] = None,
        return_each_line: bool = False,
        strip_headers: bool = True,
    ):
        self.chunks: List[Document] = []
        self.current_chunk = Document(page_content="")
        self.current_header_stack: List[Tuple[int, str]] = []
        self.strip_headers = strip_headers
        if headers_to_split_on:
            self.splittable_headers = dict(headers_to_split_on)
        else:
            # Copy so that changes to one instance never leak into the
            # class-level default shared by all instances.
            self.splittable_headers = dict(self.DEFAULT_HEADER_KEYS)

        self.return_each_line = return_each_line

    def split_text(self, text: str) -> "List[Document]":
        """Split Markdown `text` into documents.

        A new document starts at every configured header, fenced code block and
        horizontal rule. Documents that are empty or whitespace-only are
        discarded.

        Args:
            text: The Markdown source to split.

        Returns:
            The resulting documents in source order. With `return_each_line`
            set, one document per non-blank line instead.
        """
        # Reset all per-run state; otherwise a second call would return the
        # documents (and header context) of the previous call as well.
        self.chunks = []
        self.current_chunk = Document(page_content="")
        self.current_header_stack = []

        raw_lines = text.splitlines(keepends=True)

        while raw_lines:
            raw_line = raw_lines.pop(0)

            header_match = self._match_header(raw_line)
            if header_match:
                # The previous chunk must be closed *before* the stack changes
                # so that it keeps the headers it was written under.
                self._complete_chunk_doc()

                if not self.strip_headers:
                    self.current_chunk.page_content += raw_line

                header_depth = len(header_match.group(1))
                # strip() removes the "\r" of CRLF input and stray blanks
                header_text = header_match.group(2).strip()
                self._resolve_header_stack(header_depth, header_text)
                continue

            code_match = self._match_code(raw_line)
            if code_match:
                self._complete_chunk_doc()
                # Consumes the lines of the code block from `raw_lines`
                self.current_chunk.page_content = self._resolve_code_chunk(
                    raw_line, raw_lines
                )
                self.current_chunk.metadata["Code"] = code_match.group(1).strip()
                self._complete_chunk_doc()
                continue

            if self._match_horz(raw_line):
                # The rule only separates chunks; it is not kept as content.
                self._complete_chunk_doc()
                continue

            self.current_chunk.page_content += raw_line

        self._complete_chunk_doc()

        # Optionally explode every chunk into one document per non-blank line.
        # Callers that want more control can do the same outside of this class.
        if self.return_each_line:
            return [
                # Each line gets its own copy so documents do not share a dict
                Document(page_content=line, metadata=dict(chunk.metadata))
                for chunk in self.chunks
                for line in chunk.page_content.splitlines()
                if line and not line.isspace()
            ]
        return self.chunks

    def _resolve_header_stack(self, header_depth: int, header_text: str) -> None:
        """Make `header_text` the innermost active header.

        Every header of the same or a deeper level is dropped first, so a `##`
        following a `###` does not leave the stale `###` in the metadata.
        """
        stack = self.current_header_stack
        while stack and stack[-1][0] >= header_depth:
            stack.pop()
        stack.append((header_depth, header_text))

    def _resolve_code_chunk(self, current_line: str, raw_lines: List[str]) -> str:
        """Collect a fenced code block starting at `current_line`.

        Lines are popped from the front of `raw_lines` up to and including the
        next fence line.

        Returns:
            The full text of the block including both fences. If the fence is
            never closed, everything up to the end of the text is returned
            rather than being thrown away.
        """
        chunk = current_line
        while raw_lines:
            raw_line = raw_lines.pop(0)
            chunk += raw_line
            if self._match_code(raw_line):
                break
        return chunk

    def _complete_chunk_doc(self) -> None:
        """Store the current chunk (unless blank) and start a fresh one."""
        chunk_content = self.current_chunk.page_content
        # Discard any empty documents
        if chunk_content and not chunk_content.isspace():
            # Apply the header stack as metadata
            for depth, value in self.current_header_stack:
                header_key = self.splittable_headers.get("#" * depth)
                self.current_chunk.metadata[header_key] = value
            self.chunks.append(self.current_chunk)
        # Reset the current chunk
        self.current_chunk = Document(page_content="")

    # Match methods
    def _match_header(self, line: str) -> Union[re.Match, None]:
        """Match a header line; group 1 is the marker, group 2 the title."""
        match = self._HEADER_PATTERN.match(line)
        # Only matches on the configured headers
        if match and match.group(1) in self.splittable_headers:
            return match
        return None

    def _match_code(self, line: str) -> Union[re.Match, None]:
        """Match a ``` or ~~~ fence line; group 1 is the text after the fence."""
        return self._CODE_FENCE_PATTERN.match(line)

    def _match_horz(self, line: str) -> Union[re.Match, None]:
        """Match a horizontal rule made of three or more `*`, `-` or `_`."""
        return self._HORIZONTAL_RULE_PATTERN.match(line)
209 changes: 208 additions & 1 deletion libs/text-splitters/tests/unit_tests/test_text_splitters.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_text_splitters.html import HTMLHeaderTextSplitter, HTMLSectionSplitter
from langchain_text_splitters.json import RecursiveJsonSplitter
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain_text_splitters.markdown import (
ExperimentalMarkdownSyntaxTextSplitter,
MarkdownHeaderTextSplitter,
)
from langchain_text_splitters.python import PythonCodeTextSplitter

FAKE_PYTHON_TEXT = """
Expand Down Expand Up @@ -1296,6 +1299,210 @@ def test_md_header_text_splitter_with_invisible_characters(characters: str) -> N
assert output == expected_output


# Markdown fixture for the ExperimentalMarkdownSyntaxTextSplitter tests. It
# contains two header levels, a fenced python code block, a second level-1
# header, a horizontal rule ("----") and a blank-line paragraph break. The
# final line has no trailing newline.
EXPERIMENTAL_MARKDOWN_DOCUMENT = (
"# My Header 1\n"
"Content for header 1\n"
"## Header 2\n"
"Content for header 2\n"
"```python\n"
"def func_definition():\n"
" print('Keep the whitespace consistent')\n"
"```\n"
"# Header 1 again\n"
"We should also split on the horizontal line\n"
"----\n"
"This will be a new doc but with the same header metadata\n\n"
"And it includes a new paragraph"
)


def test_experimental_markdown_syntax_text_splitter() -> None:
    """Default settings split on headers, code fences and horizontal rules.

    Header lines themselves are stripped from the content and only show up in
    the metadata of the documents written beneath them.
    """
    under_h1 = {"Header 1": "My Header 1"}
    under_h2 = {"Header 1": "My Header 1", "Header 2": "Header 2"}
    under_second_h1 = {"Header 1": "Header 1 again"}
    code_block = (
        "```python\ndef func_definition():\n "
        "print('Keep the whitespace consistent')\n```\n"
    )
    last_section = (
        "This will be a new doc but with the same header metadata\n\n"
        "And it includes a new paragraph"
    )

    expected = [
        Document(page_content="Content for header 1\n", metadata=dict(under_h1)),
        Document(page_content="Content for header 2\n", metadata=dict(under_h2)),
        Document(page_content=code_block, metadata={"Code": "python", **under_h2}),
        Document(
            page_content="We should also split on the horizontal line\n",
            metadata=dict(under_second_h1),
        ),
        Document(page_content=last_section, metadata=dict(under_second_h1)),
    ]

    splitter = ExperimentalMarkdownSyntaxTextSplitter()
    actual = splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)

    assert actual == expected


def test_experimental_markdown_syntax_text_splitter_header_configuration() -> None:
    """Only configured headers split the text and name the metadata key.

    With just ``#`` configured, the ``## Header 2`` line stays in the content
    of the surrounding chunk.
    """
    header_key = "Encabezamiento 1"
    splitter = ExperimentalMarkdownSyntaxTextSplitter(
        headers_to_split_on=[("#", header_key)]
    )

    actual = splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)

    code_block = (
        "```python\ndef func_definition():\n "
        "print('Keep the whitespace consistent')\n```\n"
    )
    last_section = (
        "This will be a new doc but with the same header metadata\n\n"
        "And it includes a new paragraph"
    )
    expected = [
        Document(
            page_content="Content for header 1\n## Header 2\nContent for header 2\n",
            metadata={header_key: "My Header 1"},
        ),
        Document(
            page_content=code_block,
            metadata={"Code": "python", header_key: "My Header 1"},
        ),
        Document(
            page_content="We should also split on the horizontal line\n",
            metadata={header_key: "Header 1 again"},
        ),
        Document(
            page_content=last_section,
            metadata={header_key: "Header 1 again"},
        ),
    ]

    assert actual == expected


def test_experimental_markdown_syntax_text_splitter_with_headers() -> None:
    """With ``strip_headers=False`` each header line opens its own chunk."""
    under_h1 = {"Header 1": "My Header 1"}
    under_h2 = {"Header 1": "My Header 1", "Header 2": "Header 2"}
    under_second_h1 = {"Header 1": "Header 1 again"}

    # (page_content, metadata) in the order the splitter should produce them
    expected_parts = [
        ("# My Header 1\nContent for header 1\n", under_h1),
        ("## Header 2\nContent for header 2\n", under_h2),
        (
            "```python\ndef func_definition():\n "
            "print('Keep the whitespace consistent')\n```\n",
            {"Code": "python", **under_h2},
        ),
        (
            "# Header 1 again\nWe should also split on the horizontal line\n",
            under_second_h1,
        ),
        (
            "This will be a new doc but with the same header metadata\n\n"
            "And it includes a new paragraph",
            under_second_h1,
        ),
    ]
    expected = [
        Document(page_content=content, metadata=dict(metadata))
        for content, metadata in expected_parts
    ]

    splitter = ExperimentalMarkdownSyntaxTextSplitter(strip_headers=False)
    actual = splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)

    assert actual == expected


def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
    """With ``return_each_line=True`` every non-blank line is its own document.

    Each line carries the metadata of the chunk it came from, and line endings
    are not part of the content.
    """
    under_h1 = {"Header 1": "My Header 1"}
    under_h2 = {"Header 1": "My Header 1", "Header 2": "Header 2"}
    in_code = {"Code": "python", "Header 1": "My Header 1", "Header 2": "Header 2"}
    under_second_h1 = {"Header 1": "Header 1 again"}

    # (line, metadata) in the order the splitter should produce them
    expected_lines = [
        ("Content for header 1", under_h1),
        ("Content for header 2", under_h2),
        ("```python", in_code),
        ("def func_definition():", in_code),
        (" print('Keep the whitespace consistent')", in_code),
        ("```", in_code),
        ("We should also split on the horizontal line", under_second_h1),
        ("This will be a new doc but with the same header metadata", under_second_h1),
        ("And it includes a new paragraph", under_second_h1),
    ]
    expected = [
        Document(page_content=line, metadata=dict(metadata))
        for line, metadata in expected_lines
    ]

    splitter = ExperimentalMarkdownSyntaxTextSplitter(return_each_line=True)
    actual = splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)

    assert actual == expected


def test_solidity_code_splitter() -> None:
splitter = RecursiveCharacterTextSplitter.from_language(
Language.SOL, chunk_size=CHUNK_SIZE, chunk_overlap=0
Expand Down
Loading