Skip to content

Commit 20c4922

Browse files
authored
Merge pull request #156 from ArtifexSoftware/pdf2csv-dev
Table detection & extraction support.
2 parents f87f545 + 0070d4e commit 20c4922

File tree

9 files changed

+3397
-4
lines changed

9 files changed

+3397
-4
lines changed

Demo/Program.cs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,11 @@ static void Main(string[] args)
99
{
1010
Document doc = new();
1111
Page page = doc.NewPage();
12-
12+
1313
MuPDF.NET.TextWriter writer = new MuPDF.NET.TextWriter(page.Rect);
14-
writer.FillTextbox(page.Rect, "text field I like you from Poland", new Font(fontName: "Kenpixel", fontFile: "kenpixel.ttf"), rtl: true);
14+
writer.FillTextbox(page.Rect, "Hello World!", new Font(fontName: "helv"), rtl: true);
1515
writer.WriteText(page);
16-
17-
doc.Save("e:/res/test.pdf", pretty: 1);
16+
doc.Save("test.pdf", pretty: 1);
1817
}
1918
}
2019
}

MuPDF.NET.Test/TableTest.cs

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
using Microsoft.VisualStudio.TestPlatform.CommunicationUtilities.Resources;
2+
using System;
3+
using System.Collections.Generic;
4+
using System.Linq;
5+
using System.Text;
6+
using System.Threading.Tasks;
7+
8+
namespace MuPDF.NET.Test
9+
{
10+
/// <summary>
/// Tests for table detection and extraction through <c>Page.GetTables</c>
/// and <c>Table.Extract</c>.
/// </summary>
public class TableTest
{
    /// <summary>
    /// Looks for tables inside a clip rectangle on the first page of
    /// bordered-table.pdf using the default strategies and checks the
    /// total number of extracted cells.
    /// </summary>
    [Test]
    public void BorderedTable()
    {
        Document doc = new Document("../../../resources/bordered-table.pdf");
        int cellCount;

        try
        {
            Rect clip = new Rect(20, 100, 580, 300);
            Page page = doc[0];

            cellCount = CountCells(page.GetTables(clip: clip));
        }
        finally
        {
            // Close the document even when detection/extraction throws,
            // so a failing run does not leave the file open.
            doc.Close();
        }

        Assert.That(cellCount, Is.EqualTo(18));
    }

    /// <summary>
    /// Looks for tables on the first page of non-bordered-table.pdf using
    /// the "text" strategy for both directions and checks the total number
    /// of extracted cells.
    /// </summary>
    [Test]
    public void NonBorderedTable()
    {
        Document doc = new Document("../../../resources/non-bordered-table.pdf");
        int cellCount;

        try
        {
            Page page = doc[0];

            cellCount = CountCells(
                page.GetTables(vertical_strategy: "text", horizontal_strategy: "text"));
        }
        finally
        {
            // Close the document even when detection/extraction throws.
            doc.Close();
        }

        Assert.That(cellCount, Is.EqualTo(54));
    }

    /// <summary>
    /// Sums the number of cells over every row of every table.
    /// </summary>
    /// <param name="tables">Tables returned by <c>Page.GetTables</c>.</param>
    /// <returns>Total count of cells produced by <c>Table.Extract</c> across all tables.</returns>
    private static int CountCells(List<Table> tables)
    {
        return tables.Sum(table => table.Extract().Sum(row => row.Count));
    }
}
63+
}
86.7 KB
Binary file not shown.
42.9 KB
Binary file not shown.

MuPDF.NET/Page.cs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using Microsoft.Maui;
66
using mupdf;
77
using static System.Net.Mime.MediaTypeNames;
8+
using static MuPDF.NET.Global;
89

910
namespace MuPDF.NET
1011
{
@@ -4016,6 +4017,44 @@ public List<TextBlock> GetTextBlocks(
40164017
return Utils.GetTextBlocks(this, clip, flags, textPage, sort);
40174018
}
40184019

4020+
/// <summary>
/// Returns the tables found on this page.
/// </summary>
/// <remarks>
/// This method performs no work of its own: the page itself and every
/// argument are handed, unchanged and in declaration order, to
/// <c>Utils.GetTables</c>. The meaning of the individual tolerances is
/// defined there.
/// </remarks>
/// <param name="clip">Optional rectangle forwarded as the clip argument (null by default); presumably restricts detection to that area - confirm in Utils.GetTables.</param>
/// <param name="vertical_strategy">Strategy name for vertical edges; defaults to "lines".</param>
/// <param name="horizontal_strategy">Strategy name for horizontal edges; defaults to "lines".</param>
/// <param name="strategy">Optional; described by the original author as an abbreviation - presumably sets both strategies at once, confirm in Utils.GetTables.</param>
/// <param name="add_lines">Optional list of lines forwarded as the last argument.</param>
/// <returns>A list of tables. Each table can be turned into rows of cell text.</returns>
public List<Table> GetTables(
    Rect clip = null,
    string vertical_strategy = "lines",
    string horizontal_strategy = "lines",
    List<Edge> vertical_lines = null,
    List<Edge> horizontal_lines = null,
    float snap_tolerance = TableFlags.TABLE_DEFAULT_SNAP_TOLERANCE,
    float snap_x_tolerance = 0.0f,
    float snap_y_tolerance = 0.0f,
    float join_tolerance = TableFlags.TABLE_DEFAULT_JOIN_TOLERANCE,
    float join_x_tolerance = 0.0f,
    float join_y_tolerance = 0.0f,
    float edge_min_length = 3.0f,
    float min_words_vertical = TableFlags.TABLE_DEFAULT_MIN_WORDS_VERTICAL,
    float min_words_horizontal = TableFlags.TABLE_DEFAULT_MIN_WORDS_HORIZONTAL,
    float intersection_tolerance = 3.0f,
    float intersection_x_tolerance = 0.0f,
    float intersection_y_tolerance = 0.0f,
    float text_tolerance = 3.0f,
    float text_x_tolerance = 3.0f,
    float text_y_tolerance = 3.0f,
    string strategy = null, // offer abbreviation
    List<Line> add_lines = null
)
    => Utils.GetTables(
        this,
        clip,
        vertical_strategy,
        horizontal_strategy,
        vertical_lines,
        horizontal_lines,
        snap_tolerance,
        snap_x_tolerance,
        snap_y_tolerance,
        join_tolerance,
        join_x_tolerance,
        join_y_tolerance,
        edge_min_length,
        min_words_vertical,
        min_words_horizontal,
        intersection_tolerance,
        intersection_x_tolerance,
        intersection_y_tolerance,
        text_tolerance,
        text_x_tolerance,
        text_y_tolerance,
        strategy,
        add_lines);
4057+
40194058
/// <summary>
40204059
/// Run page through a device.
40214060
/// </summary>

0 commit comments

Comments
 (0)