microsoft · dayesouza · Jan 21, 2025 · Jan 21, 2025 · Jan 22, 2025
@@ -58,7 +58,7 @@ jobs:
       GRAPHRAG_EMBEDDING_TPM: 87_500 # 350,000 / 4
       GRAPHRAG_EMBEDDING_RPM: 525 # 2,100 / 4
       GRAPHRAG_CHUNK_SIZE: 1200
-      GRAPHRAG_CHUNK_OVERLAP: 0
+      GRAPHRAG_CHUNK_OVERLAP: 100
       # Azure AI Search config
       AZURE_AI_SEARCH_URL_ENDPOINT: ${{ secrets.AZURE_AI_SEARCH_URL_ENDPOINT }}
       AZURE_AI_SEARCH_API_KEY: ${{ secrets.AZURE_AI_SEARCH_API_KEY }}

@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "Fix chunk overlap bug where an explicit value of 0 was ignored and replaced by the default"
+}
@@ -411,9 +411,12 @@ def hydrate_parallelization_params(
                 reader.str(Fragment.encoding_model) or global_encoding_model
             )
             strategy = reader.str("strategy")
+            overlap = reader.int("overlap")
+            if overlap is None:
+                overlap = defs.CHUNK_OVERLAP
             chunks_model = ChunkingConfig(
                 size=reader.int("size") or defs.CHUNK_SIZE,
-                overlap=reader.int("overlap") or defs.CHUNK_OVERLAP,
+                overlap=overlap,
                 group_by_columns=group_by_columns,
                 encoding_model=encoding_model,
                 strategy=ChunkStrategyType(strategy)

@@ -351,6 +351,33 @@ def test_can_set_no_chunk_by_columns(self):
         parameters = create_graphrag_config()
         assert parameters.chunks.group_by_columns == []
 
+    @mock.patch.dict(
+        os.environ,
+        {"GRAPHRAG_CHUNK_OVERLAP": "0", "GRAPHRAG_API_KEY": "test"},
+        clear=True,
+    )
+    def test_can_set_chunk_zero_overlap(self):
+        parameters = create_graphrag_config()
+        assert parameters.chunks.overlap == 0
+
+    @mock.patch.dict(
+        os.environ,
+        {"GRAPHRAG_API_KEY": "test"},
+        clear=True,
+    )
+    def test_can_set_chunk_none_overlap(self):
+        parameters = create_graphrag_config()
+        assert parameters.chunks.overlap == 100
+
+    @mock.patch.dict(
+        os.environ,
+        {"GRAPHRAG_CHUNK_OVERLAP": "42", "GRAPHRAG_API_KEY": "test"},
+        clear=True,
+    )
+    def test_can_set_chunk_value_overlap(self):
+        parameters = create_graphrag_config()
+        assert parameters.chunks.overlap == 42
+
     def test_all_env_vars_is_accurate(self):
         env_var_docs_path = Path("docs/config/env_vars.md")
 
@@ -533,6 +560,9 @@ def test_yaml_load_e2e():
   requests_per_minute: 900
   thread_count: 50
   concurrent_requests: 25
+
+chunks:
+  overlap: 0
 """
     )
     # create default configuration pipeline parameters from the custom settings
@@ -544,6 +574,7 @@ def test_yaml_load_e2e():
     assert parameters.llm.api_base == "http://test"
     assert parameters.llm.api_version == "v1"
     assert parameters.llm.deployment_name == "test"
+    assert parameters.chunks.overlap == 0
 
     # generate the pipeline from the default parameters
     pipeline_config = create_pipeline_config(parameters, True)