Skip to content

Commit c5c3b14

Browse files
committed
Support basic text extraction, minor speedup by caching xref portion of stream object streams
1 parent 8cab89a commit c5c3b14

File tree

7 files changed

+70
-10
lines changed

7 files changed

+70
-10
lines changed

dist/CombinePDF.xlsm

17 KB
Binary file not shown.

src/pdfLib.xlsm/Class Modules/pdfDocument.cls

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1025,7 +1025,7 @@ Public Function loadPdf(ByVal pdfFilename As String) As Boolean
10251025
' root obj of PDF
10261026
Set rootCatalog = GetRootObject(content, trailer, xrefTable)
10271027
Debug.Print BytesToString(rootCatalog.serialize())
1028-
objectCache.Add rootCatalog.id, rootCatalog
1028+
'If Not objectCache.Exists(rootCatalog.ID) Then objectCache.Add rootCatalog.ID, rootCatalog
10291029

10301030
loadPdf = True ' success
10311031
Exit Function
@@ -1507,7 +1507,7 @@ Private Function GetValueType(ByRef bytes() As Byte, ByVal offset As Long, Optio
15071507

15081508
Case Else
15091509
GetValueType = PDF_ValueType.PDF_Null
1510-
Stop ' error! unexpected value
1510+
'Stop ' error! unexpected value
15111511
End Select
15121512

15131513
Exit Function

src/pdfLib.xlsm/Class Modules/pdfStream.cls

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ Public stream_data As pdfValue ' PDF_StreamData
1616
' stream data may be compressed; on first access to the uncompressed data we decompress it and cache the result
1717
Private m_udata() As Byte
1818
Private decompressed As Boolean
19+
' the first time, we extract the embedded reference (xref) table; future loadObject calls use the cached version
20+
Public embXRefTable As Dictionary
1921

2022

2123
' returns size of data(), but warns if it differs from the value of the meta /Length property
@@ -196,10 +198,12 @@ End Sub
196198
Public Sub Class_Initialize()
197199
Set stream_meta = New pdfValue
198200
stream_meta.valueType = PDF_ValueType.PDF_Null
201+
Set embXRefTable = Nothing ' indicates not yet loaded
199202
End Sub
200203

201204
Public Sub Class_Terminate()
202205
Set stream_meta = Nothing
203206
Set stream_data = Nothing
204207
Erase m_udata
208+
Set embXRefTable = Nothing
205209
End Sub
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
VERSION 5.00
2+
Begin {C62A69F0-16DC-11CE-9E98-00AA00574A4F} UserForm2
3+
Caption = "UserForm1"
4+
ClientHeight = 9045.001
5+
ClientLeft = 120
6+
ClientTop = 465
7+
ClientWidth = 13125
8+
OleObjectBlob = "UserForm2.frx":0000
9+
StartUpPosition = 1 'CenterOwner
10+
End
11+
Attribute VB_Name = "UserForm2"
12+
Attribute VB_GlobalNameSpace = False
13+
Attribute VB_Creatable = False
14+
Attribute VB_PredeclaredId = True
15+
Attribute VB_Exposed = False
2.52 KB
Binary file not shown.

src/pdfLib.xlsm/Modules/Tests.bas

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,17 @@ Option Explicit
44

55

66
Sub TestReadingText()
7-
Const basedir As String = "C:\Users\jeremyd\Downloads\pdf-association.pdf20examples\"
7+
Const basedir As String = "C:\Users\jeremyd\Downloads\"
8+
'Const fname As String = "pdf-association.pdf20examples\pdf20-utf8-test.pdf"
9+
Const fname As String = "weird_spaces.pdf"
810
Dim pdfDoc As pdfDocument
9-
Set pdfDoc = pdfDocument.pdfDocument(basedir & "pdf20-utf8-test.pdf")
11+
Set pdfDoc = pdfDocument.pdfDocument(basedir & fname)
1012
Dim text As String
1113
text = pdfDoc.GetDocumentText()
1214
Debug.Print text
15+
Dim uf As UserForm2: Set uf = New UserForm2
16+
uf.Label1.Caption = pdfDoc.Title & vbNewLine & text
17+
uf.Show
1318
End Sub
1419

1520
Sub TestBookmarkEditor()

src/pdfLib.xlsm/Modules/pdfParseAndGetValues.bas

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,53 @@ Public Enum PdfTextOp
1515
opT_D = 7 ' TD Move text position and set leading
1616
opTm = 8 ' Tm Set text matrix
1717
opTStar = 9 ' T* Move to start of next line
18+
19+
opTextObjectBegin ' BT
20+
opTextObjectEnd ' ET
1821
End Enum
1922

23+
24+
Type Rectangle
25+
Left As Double
26+
Bottom As Double
27+
Right As Double
28+
Top As Double
29+
End Type
30+
31+
Type FontState
32+
FontName As String
33+
FontSize As Double
34+
IsBold As Boolean
35+
IsItalic As Boolean
36+
End Type
37+
38+
Type TextState
39+
Font As FontState
40+
TextMatrix(0 To 5) As Double 'Current text matrix [a b c d e f]
41+
LineMatrix(0 To 5) As Double 'Text line matrix
42+
CTM(0 To 5) As Double 'Current transformation matrix
43+
CharSpacing As Double 'Tc
44+
WordSpacing As Double 'Tw
45+
HorizontalScaling As Double 'Tz (as percentage)
46+
Leading As Double 'TL
47+
TextRise As Double 'Ts
48+
End Type
49+
2050
'Represents a visible text run with position & styling
2151
Type TextFragment
2252
text As String
23-
x As Double 'User-space coordinates
24-
Y As Double
25-
fontName As String
26-
fontSize As Double
27-
IsBold As Boolean
28-
IsItalic As Boolean
53+
x As Double ' User-space coordinates
54+
y As Double
55+
state As TextState
56+
BoundingBox As Rectangle ' of just this chunk of text, user-space coordinates
57+
End Type
58+
59+
'Represents a chunk of text (quasi paragraph)
60+
Type TextBlock
61+
text As String
62+
BoundingBox As Rectangle ' of all Fragments
63+
StartState As TextState
64+
Fragments() As TextFragment ' Collection of TextFragment
2965
End Type
3066

3167
'Logical-structure element (see §10.7 PDF 1.7)

0 commit comments

Comments
 (0)