Skip to content

Commit 9a8d0e2

Browse files
committed
add caching to uncompressed object stream data due to slowness of VBA decompression
1 parent 8dfdb2e commit 9a8d0e2

File tree

7 files changed

+245
-693
lines changed

7 files changed

+245
-693
lines changed

dist/CombinePDF.xlsm

-3.54 KB
Binary file not shown.

src/Tests.bas

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ Sub TestHeaderAndVersion()
1919
Debug.Print "[" & pdfDoc.Header & "]"
2020
End Sub
2121

22-
2322
Sub TestProblemPdfs()
2423
On Error GoTo errHandler
2524
Const basedir As String = "C:\Users\jeremyd\Downloads\"

src/inflate_rfc1951.bas

Lines changed: 0 additions & 647 deletions
This file was deleted.

src/pdfDocument.cls

Lines changed: 71 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ Private m_Pages As pdfValue
3232
' cache of loaded/shared objects
3333
Public objectCache As Dictionary ' of pdfValue key'd by object id
3434

35+
' cache only used when loading embedded objects from stream object streams, caches the stream object
36+
Private sosCache As Dictionary
37+
3538
' the full contents of raw PDF document
3639
Private content() As Byte
3740

@@ -129,20 +132,16 @@ Public Property Get Info() As pdfValue
129132
' note: we Set m_Info instead of Me.Info as we are retrieving /Info id from trailer's reference object already
130133

131134
' get reference to /Info object (object itself should not be stored directly in trailer)
135+
' if no /Info key or it refers to null, then returns pdf null value
136+
' (if /Info stored directly as object, not valid, but we will return that anyway)
132137
Dim infoRefObj As pdfValue: Set infoRefObj = GetInfo(trailer)
133-
Dim infoId As Long: infoId = -1
134138
If infoRefObj.valueType = PDF_ValueType.PDF_Reference Then
135-
infoId = CLng(infoRefObj.Value)
139+
Set m_Info = getObject(CLng(infoRefObj.Value), CLng(infoRefObj.generation), cacheObject:=False)
136140
'ElseIf info.valueType = PDF_ValueType.PDF_Object Then
141+
'ElseIf info.valueType = PDF_ValueType.PDF_Null Then
142+
'Else error, unexpected type
137143
End If
138144

139-
If objectCache.Exists(infoId) Then
140-
Set m_Info = objectCache.Item(infoId)
141-
Else
142-
'Set m_Info = GetInfoObject(content, trailer, xrefTable)
143-
Set m_Info = getObject(content, xrefTable, infoId)
144-
'objectCache.Add m_Info.id, m_Info
145-
End If
146145
Debug.Print BytesToString(m_Info.serialize())
147146
End If
148147

@@ -203,13 +202,8 @@ Public Property Get Meta() As pdfValue
203202
Dim pdfRefObj As pdfValue
204203
Set pdfRefObj = rootCatalog.asDictionary.Item("/Metadata")
205204
' 1st try loading cached object, if that fails, try loading from raw content()
206-
If objectCache.Exists(CLng(pdfRefObj.Value)) Then
207-
Set m_Meta = objectCache.Item(CLng(pdfRefObj.Value))
208-
Else
209-
Set m_Meta = getObject(content, xrefTable, pdfRefObj.Value)
210-
End If
205+
Set m_Meta = getObject(CLng(pdfRefObj.Value), pdfRefObj.generation, cacheObject:=False)
211206
Debug.Print BytesToString(m_Meta.serialize())
212-
'objectCache.Add m_Meta.id, m_Meta
213207
Else
214208
Debug.Print "No /Metadata object specified in root /Catalog"
215209
End If
@@ -237,7 +231,7 @@ Public Property Get pages() As pdfValue
237231
' obj should now be a reference to our /Pages object
238232
If obj.valueType = PDF_ValueType.PDF_Reference Then
239233
' retrieve object (not reference) matching given obj reference's (id, generation) pair
240-
Set m_Pages = getCachedObject(obj.Value, obj.generation)
234+
Set m_Pages = getObject(CLng(obj.Value), obj.generation)
241235
ElseIf obj.valueType = PDF_ValueType.PDF_Object Then
242236
' ok, weird but whatever
243237
Set m_Pages = obj
@@ -257,8 +251,65 @@ End Property
257251

258252

259253
' returns matching object from PDF
260-
' will first try to find in objectCache, then uses cross reference table to load from content,
254+
' will first try to find in objectCache, then uses cross reference table to load from content(),
261255
' otherwise returns pdf null object
256+
' Note: unless cacheObject is set to False, if successfully extracted object from content() then will add to object cache
257+
Public Function getObject(ByVal id As Long, ByVal generation As Long, Optional ByVal cacheObject As Boolean = True) As pdfValue
258+
Dim obj As pdfValue
259+
' first try loading from our cache
260+
Set obj = getCachedObject(id, generation)
261+
' if it wasn't there then try loading from content()
262+
If obj.valueType = PDF_ValueType.PDF_Null Then
263+
Set obj = pdfParseAndGetValues.getObject(content, xrefTable, id, sosCache)
264+
If obj Is Nothing Then Set obj = New pdfValue
265+
266+
' if successfully loaded, update our object cache
267+
If cacheObject And (obj.valueType <> PDF_ValueType.PDF_Null) Then
268+
If objectCache.Exists(id) Then
269+
' we need to determine if same generation, presumably not or we would have retrieved it above
270+
Dim dict As Dictionary
271+
Dim curObj As pdfValue
272+
Dim v As Variant
273+
Set v = objectCache.Item(id)
274+
If TypeName(v) = "pdfValue" Then
275+
Set curObj = objectCache.Item(id)
276+
If curObj.generation <> generation Then
277+
Set dict = New Dictionary
278+
dict.Add curObj.generation, curObj
279+
dict.Add generation, obj
280+
Set objectCache(id) = dict
281+
Set dict = Nothing
282+
Else
283+
Debug.Print "Internal Error! - id & generation the same but not?"
284+
Stop
285+
End If
286+
Set curObj = Nothing
287+
ElseIf TypeName(v) = "Dictionary" Then
288+
' assume may contain multiple generations, key'd by generation
289+
Set dict = v
290+
If dict.Exists(generation) Then
291+
Set dict(generation) = obj
292+
Else
293+
dict.Add generation, obj
294+
End If
295+
Set dict = Nothing
296+
Else
297+
Debug.Print "Internal Error!"
298+
Stop
299+
End If
300+
Else
301+
objectCache.Add id, obj
302+
End If
303+
End If
304+
End If
305+
306+
Set getObject = obj
307+
End Function
308+
309+
310+
' returns matching object from PDF
311+
' will first try to find in objectCache, if not found in cache then returns pdf null object
312+
' Does not attempt to load from raw pdf in content() via cross reference table
262313
Public Function getCachedObject(ByVal id As Long, ByVal generation As Long) As pdfValue
263314
Dim v As Variant
264315
' return object, but verify matches expected generation
@@ -366,7 +417,7 @@ End Function
366417
' will fail if loadPDF has not been called or content() array is not prefilled by other means
367418
' True on success, false on any error
368419
Public Function parsePdf() As Boolean
369-
GetObjectsInTree rootCatalog, content, xrefTable, objectCache
420+
GetObjectsInTree rootCatalog, content, xrefTable, objectCache, sosCache
370421

371422
' /Info is in trailer, so won't be loaded as part of rootCatalog,
372423
' Note first call to Info actually retreives, so need to: Set Info = GetInfoObject(content, trailer, xrefTable)
@@ -633,6 +684,7 @@ End Sub
633684
Private Sub Class_Initialize()
634685
On Error GoTo errHandler
635686
Set objectCache = New Dictionary
687+
Set sosCache = New Dictionary
636688
Set xrefTable = NewXrefTable()
637689
Set rootCatalog = New pdfValue
638690
Set trailer = NewTrailer()
@@ -648,6 +700,7 @@ End Sub
648700
Private Sub Class_Terminate()
649701
On Error GoTo errHandler
650702
Set objectCache = Nothing
703+
Set sosCache = Nothing
651704
Set xrefTable = Nothing
652705
Set rootCatalog = Nothing
653706
Set trailer = Nothing

src/pdfParseAndGetValues.bas

Lines changed: 41 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -96,11 +96,9 @@ Function GetValueType(ByRef bytes() As Byte, ByVal offset As Long) As PDF_ValueT
9696
Case "%"
9797
GetValueType = PDF_ValueType.PDF_Comment
9898

99-
Case "e", ">" ', "]"
99+
Case "e", ">" '
100100
tmpStr = GetWord(bytes, offset)
101101
Select Case LCase(tmpStr)
102-
' Case "]"
103-
' GetValueType = PDF_ValueType.PDF_EndOfArray
104102
Case ">>"
105103
GetValueType = PDF_ValueType.PDF_EndOfDictionary
106104
Case "endstream"
@@ -451,8 +449,7 @@ Function GetDictionaryValue(ByRef Value As pdfValue, ByVal name As String) As pd
451449
If Value.Value.Exists(name) Then
452450
Set GetDictionaryValue = Value.Value.Item(name)
453451
Else
454-
Set GetDictionaryValue = New pdfValue
455-
GetDictionaryValue.valueType = PDF_ValueType.PDF_Null
452+
Set GetDictionaryValue = New pdfValue ' defaults to PDF_ValueType.PDF_Null
456453
End If
457454
Exit Function
458455
errHandler:
@@ -640,6 +637,9 @@ Function ParseXrefTable(ByRef content() As Byte, ByRef offset As Long, ByRef tra
640637

641638
' we need the uncompressed (un-/Filter'd) data
642639
Dim rawData() As Byte
640+
#If True Then
641+
rawData = objStream.udata
642+
#Else
643643
If objStream.Meta.Exists("/Filter") Then
644644
' TODO support all /Filter types
645645
Dim filter As String
@@ -753,6 +753,7 @@ Function ParseXrefTable(ByRef content() As Byte, ByRef offset As Long, ByRef tra
753753
Else
754754
rawData = objStream.data
755755
End If
756+
#End If
756757

757758
Dim objOffset As Long
758759
objOffset = 0
@@ -849,7 +850,10 @@ errHandler:
849850
End Function
850851

851852

852-
Function getObject(ByRef content() As Byte, ByRef xrefTable As Dictionary, ByVal Index As Long) As pdfValue
853+
' extracts/parses pdf object from raw pdf content()
854+
' due to potential slowness uncompressing in VBA, stream object streams should be cached
855+
' the sosCache is only used for stream object streams and only if provided
856+
Function getObject(ByRef content() As Byte, ByRef xrefTable As Dictionary, ByVal Index As Long, ByRef sosCache As Dictionary) As pdfValue
853857
On Error GoTo errHandler
854858
Dim obj As pdfValue
855859
If xrefTable.Exists(Index) Then
@@ -862,17 +866,26 @@ Function getObject(ByRef content() As Byte, ByRef xrefTable As Dictionary, ByVal
862866
Dim cntrObjEntry As xrefEntry
863867
Set cntrObjEntry = xrefTable.Item(entry.embedObjId)
864868
Dim cntrObj As pdfValue
865-
Set cntrObj = getObject(content, xrefTable, entry.embedObjId)
866-
Dim dict As Dictionary
867-
Set dict = cntrObj.Value.Value.Meta
869+
' try loading containing object (stream object stream) from cache before potentially uncompressing
870+
If Not sosCache Is Nothing Then
871+
If sosCache.Exists(entry.embedObjId) Then Set cntrObj = sosCache(entry.embedObjId)
872+
End If
873+
If cntrObj Is Nothing Then ' not in cache or no cache provided
874+
Set cntrObj = getObject(content, xrefTable, entry.embedObjId, sosCache)
875+
If Not sosCache Is Nothing Then Set sosCache(entry.embedObjId) = cntrObj ' add/update cache
876+
End If
868877

869878
' extract our embedded object
870-
Dim cbuf() As Byte, buffer() As Byte
871-
Dim inOff As Long, outSize As Long, estBufSize As Long
872-
cbuf = cntrObj.Value.Value.data
873-
inOff = 2
874-
If dict.Exists("/DL") Then estBufSize = CLng(dict.Item("/DL").Value) ' only a hint
875-
If inflate2(cbuf, buffer, inOff, outSize, estBufSize) Then
879+
If cntrObj.Value.valueType <> PDF_ValueType.PDF_Stream Then
880+
Debug.Print "Error! expecting stream object stream!"
881+
Stop
882+
GoTo nullValue
883+
End If
884+
Dim streamObjectStream As pdfStream
885+
Set streamObjectStream = cntrObj.Value.Value
886+
Dim buffer() As Byte
887+
buffer = streamObjectStream.udata
888+
If (UBound(buffer) - LBound(buffer)) > 0 Then
876889
' parse embedded object data
877890
' buffer has N sets of obj id# <whitespace> offset
878891
' immediately followed by objects' data, note: /First
@@ -883,6 +896,8 @@ Function getObject(ByRef content() As Byte, ByRef xrefTable As Dictionary, ByVal
883896
Dim embOffset As Long
884897
Dim i As Long
885898
Dim firstOffset As Long
899+
Dim dict As Dictionary
900+
Set dict = streamObjectStream.Meta
886901
If dict.Exists("/First") Then
887902
firstOffset = CLng(dict.Item("/First").Value)
888903
Else
@@ -927,7 +942,7 @@ Function getObject(ByRef content() As Byte, ByRef xrefTable As Dictionary, ByVal
927942
obj.valueType = PDF_ValueType.PDF_Object
928943
Set obj.Value = GetValue(buffer, embOffset)
929944
Else
930-
Debug.Print "Error inflating embedded object!"
945+
Debug.Print "Error reading embedded object!"
931946
Stop
932947
End If
933948
Else
@@ -956,7 +971,7 @@ Function GetRootObject(ByRef content() As Byte, ByRef trailer As pdfValue, ByRef
956971
' get either reference or /Root object itself
957972
Set root = GetRoot(trailer)
958973
If root.valueType = PDF_ValueType.PDF_Reference Then
959-
Set root = getObject(content, xrefTable, root.Value)
974+
Set root = getObject(content, xrefTable, root.Value, Nothing)
960975
'ElseIf root.valueType = PDF_ValueType.PDF_Object Then
961976
End If
962977

@@ -976,7 +991,7 @@ Function GetInfoObject(ByRef content() As Byte, ByRef trailer As pdfValue, ByRef
976991
' get either reference or /Info object itself
977992
Set Info = GetInfo(trailer)
978993
If Info.valueType = PDF_ValueType.PDF_Reference Then
979-
Set Info = getObject(content, xrefTable, Info.Value)
994+
Set Info = getObject(content, xrefTable, Info.Value, Nothing)
980995
'ElseIf info.valueType = PDF_ValueType.PDF_Object Then
981996
End If
982997

@@ -991,7 +1006,7 @@ End Function
9911006

9921007

9931008
' updates objects Dictionary with all objects under root node, indexed by object id, i.e. loads a chunk of the PDF document
994-
Sub GetObjectsInTree(ByRef root As pdfValue, ByRef content() As Byte, ByRef xrefTable As Dictionary, ByRef objects As Dictionary)
1009+
Sub GetObjectsInTree(ByRef root As pdfValue, ByRef content() As Byte, ByRef xrefTable As Dictionary, ByRef objects As Dictionary, ByRef sosCache As Dictionary)
9951010
On Error GoTo errHandler
9961011
Dim obj As pdfValue
9971012
Dim v As Variant
@@ -1003,27 +1018,27 @@ Sub GetObjectsInTree(ByRef root As pdfValue, ByRef content() As Byte, ByRef xref
10031018
Case PDF_ValueType.PDF_Array
10041019
For Each v In root.Value
10051020
Set obj = v
1006-
GetObjectsInTree obj, content, xrefTable, objects
1021+
GetObjectsInTree obj, content, xrefTable, objects, sosCache
10071022
Next v
10081023
Case PDF_ValueType.PDF_Dictionary
10091024
For Each v In root.Value.Items
10101025
Set obj = v
1011-
GetObjectsInTree obj, content, xrefTable, objects
1026+
GetObjectsInTree obj, content, xrefTable, objects, sosCache
10121027
Next v
10131028
Case PDF_ValueType.PDF_Object
1014-
GetObjectsInTree root.Value, content, xrefTable, objects
1029+
GetObjectsInTree root.Value, content, xrefTable, objects, sosCache
10151030
Case PDF_ValueType.PDF_Reference
10161031
' we need to load object
10171032
If Not objects.Exists(CLng(root.Value)) Then
1018-
Set obj = getObject(content, xrefTable, root.Value)
1033+
Set obj = getObject(content, xrefTable, root.Value, sosCache)
10191034
objects.Add CLng(root.Value), obj
1020-
GetObjectsInTree obj, content, xrefTable, objects
1035+
GetObjectsInTree obj, content, xrefTable, objects, sosCache
10211036
End If
10221037
Case PDF_ValueType.PDF_Stream
10231038
Dim stream As pdfStream
10241039
Set stream = root.Value
1025-
GetObjectsInTree stream.stream_meta, content, xrefTable, objects
1026-
GetObjectsInTree stream.stream_data, content, xrefTable, objects
1040+
GetObjectsInTree stream.stream_meta, content, xrefTable, objects, sosCache
1041+
GetObjectsInTree stream.stream_data, content, xrefTable, objects, sosCache
10271042
Case PDF_ValueType.PDF_StreamData
10281043
' Nothing to do
10291044
Case PDF_ValueType.PDF_Trailer

0 commit comments

Comments
 (0)