Skip to content

Commit 96c5575

Browse files
committed
fix bugs in renumbering, combine back to basic functionality but ready for proper outline support
1 parent 651e132 commit 96c5575

File tree

7 files changed

+111
-69
lines changed

7 files changed

+111
-69
lines changed

dist/CombinePDF.xlsm

-19.4 KB
Binary file not shown.

src/pdfLib.xlsm/Class Modules/pdfDocument.cls

Lines changed: 80 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -392,48 +392,97 @@ End Sub
392392

393393

394394
' updates id of all objects under root pdfValue beginning at baseId
395-
' Note: use trailer as root to renumber whole document
395+
' Note: if root is Nothing then will update whole document including caches
396396
' Warning: assumes each obj id refers to same obj and objReference instances
397-
Public Function renumberIds(ByRef root As pdfValue, ByVal baseId As Long, Optional ByRef visited As Dictionary = Nothing) As Long
397+
Public Function renumberIds(ByVal baseId As Long, Optional ByRef root As pdfValue = Nothing, Optional ByRef visited As Dictionary = Nothing) As Long
398398
On Error GoTo errHandler
399-
If visited Is Nothing Then Set visited = New Dictionary
400399
Dim v As Variant
401400
Dim obj As pdfValue
402-
' sanity check
403-
If root Is Nothing Then Exit Function
401+
Dim fixCache As Boolean
402+
403+
' renumber whole tree?
404+
If root Is Nothing Then
405+
fixCache = True
406+
Set root = trailer
407+
End If
408+
409+
' should be Nothing on first call, we use this to avoid recursion and thus renumber already renumbered objs
410+
If visited Is Nothing Then
411+
Set visited = New Dictionary
412+
End If
413+
404414
' don't renumber same object more than once
405415
If root.id <> 0 Then
406-
If visited.Exists(root.id) Then
416+
If visited.Exists(root) Then
407417
renumberIds = baseId
408418
Exit Function
409419
End If
410420
' and actually do the renumbering
411421
root.id = baseId
412422
baseId = baseId + 1
413-
visited.Add root.id, root.id
423+
' Note: we use actual object and not id here as temp there may be multiple objs with same id
424+
' e.g. obj 1 renumbered to 4, and there is still an object 4 not yet renumbered
425+
visited.Add root, root
414426
End If
415427
' update any children
416428
Select Case root.valueType
417429
Case PDF_ValueType.PDF_Array
418430
For Each v In root.value
419431
Set obj = v
420-
baseId = renumberIds(obj, baseId, visited)
432+
baseId = renumberIds(baseId, obj, visited)
421433
Next v
422434
Case PDF_ValueType.PDF_Dictionary
423435
For Each v In root.value.Items
424436
Set obj = v
425-
baseId = renumberIds(obj, baseId, visited)
437+
baseId = renumberIds(baseId, obj, visited)
426438
Next v
427439
Case PDF_ValueType.PDF_Reference
428440
Set obj = getObject(root.value, 0)
429-
baseId = renumberIds(obj, baseId, visited)
441+
baseId = renumberIds(baseId, obj, visited)
430442
Case PDF_ValueType.PDF_Trailer, PDF_ValueType.PDF_Object
431-
baseId = renumberIds(root.value, baseId, visited)
443+
baseId = renumberIds(baseId, root.value, visited)
432444
Case Else
433445
' no nested objects that need id's updated
434446
End Select
435447
' last id used, or same as provided if nothing updated
436448
renumberIds = baseId
449+
450+
' because we renumbered everything, we need to invalidate or rebuild our cache's
451+
' but only from top level call
452+
' note: because we have renumbered, we also invalidate our content buffer as it refers to old ids
453+
If fixCache Then
454+
'Set visited = Nothing ' this is ByRef so only set to Nothing if originally Nothing
455+
456+
If ByteArraySize(content) >= 0 Then Erase content
457+
Dim newDict As Dictionary
458+
If Not objectCache Is Nothing Then
459+
Set newDict = New Dictionary
460+
For Each v In objectCache.Items
461+
Set obj = v
462+
If obj.id <> 0 Then Set newDict(obj.id) = obj
463+
Next v
464+
Set objectCache = newDict
465+
Set newDict = Nothing
466+
End If
467+
468+
If Not refCache Is Nothing Then
469+
Set newDict = New Dictionary
470+
For Each v In refCache.Items
471+
Set obj = v
472+
With obj
473+
If .id <> 0 Then
474+
Set newDict(.id & "_" & .generation) = obj
475+
Set newDict(.id & " " & .generation & " R") = obj.referenceObj
476+
End If
477+
End With
478+
Next v
479+
Set refCache = newDict
480+
Set newDict = Nothing
481+
End If
482+
483+
Set sosCache = Nothing
484+
End If
485+
437486
Exit Function
438487
errHandler:
439488
Debug.Print "Error: " & Err.Description & " (" & Err.Number & ")"
@@ -453,7 +502,7 @@ Public Function getObject(ByVal id As Long, ByVal generation As Long, Optional B
453502
Set obj = getCachedObject(id, generation)
454503
' if it wasn't there then try loading from content()
455504
If obj.valueType = PDF_ValueType.PDF_Null Then
456-
Set obj = loadObject(content, xrefTable, id)
505+
Set obj = loadObject(id)
457506
If obj Is Nothing Then Set obj = New pdfValue
458507

459508
' if successfully loaded, update our object cache
@@ -1325,7 +1374,6 @@ Function GetValue(ByRef bytes() As Byte, ByRef offset As Long, Optional ByRef Me
13251374
Case PDF_ValueType.PDF_Reference, PDF_ValueType.PDF_Object
13261375
Dim words(0 To 2) As String
13271376
words(0) = GetWord(bytes, offset)
1328-
If IsMatch(words(0), "3") Then Stop
13291377
offset = SkipWhiteSpace(bytes, offset)
13301378
words(1) = GetWord(bytes, offset)
13311379
offset = SkipWhiteSpace(bytes, offset)
@@ -1446,19 +1494,33 @@ End Function
14461494

14471495

14481496
' extracts/parses pdf object from raw pdf content()
1497+
' uses xrefTable to determine offset directly or obtain using cross reference stream
14491498
' due to potential slowness uncompressing in VBA, stream object streams should be cached
14501499
' the sosCache is only used for stream object streams and only if provided
14511500
' the refCache is to ensure only 1 indirect reference object instance is created for each object
14521501
' this will use provided VBA reference object instance instead of creating a new [different] one
1453-
Function loadObject(ByRef content() As Byte, ByRef xrefTable As Dictionary, ByVal Index As Long) As pdfValue
1502+
' forceReload bypasses refCache and returns a new VBA object instance ***
1503+
' *** changes to this object's id may not effect any other object nor their reference values
1504+
' *** if object previously loaded returns a new instance, otherwise results same as if is False
1505+
' *** possibly useful to duplicate an object to be mutated and added, but use with care
1506+
Function loadObject(ByVal Index As Long, Optional forceReload As Boolean = False) As pdfValue
14541507
On Error GoTo errHandler
14551508
Dim obj As pdfValue
1509+
1510+
' speed up, ensure single object returned, and avoid issues when objects are removed from current objectCache
1511+
If (Not forceReload) And (Not refCache Is Nothing) Then
1512+
If refCache.Exists(Index) Then
1513+
Set loadObject = refCache(Index)
1514+
Exit Function
1515+
End If
1516+
End If
1517+
14561518
If xrefTable.Exists(Index) Then
14571519
Dim entry As xrefEntry
14581520
Set entry = xrefTable.Item(Index)
14591521
If entry.isFree Or ((Not entry.isEmbeded) And (entry.offset <= 0)) Then GoTo NullValue
14601522
Dim offset As Long: offset = entry.offset
1461-
If offset > UBound(content) Then GoTo NullValue
1523+
If offset > ByteArraySize(content) Then GoTo NullValue
14621524
If entry.isEmbeded Then
14631525
Dim cntrObjEntry As xrefEntry
14641526
Set cntrObjEntry = xrefTable.Item(entry.embedObjId)
@@ -1468,7 +1530,7 @@ Function loadObject(ByRef content() As Byte, ByRef xrefTable As Dictionary, ByVa
14681530
If sosCache.Exists(entry.embedObjId) Then Set cntrObj = sosCache(entry.embedObjId)
14691531
End If
14701532
If cntrObj Is Nothing Then ' not in cache or no cache provided
1471-
Set cntrObj = loadObject(content, xrefTable, entry.embedObjId)
1533+
Set cntrObj = loadObject(entry.embedObjId)
14721534
If Not sosCache Is Nothing Then Set sosCache(entry.embedObjId) = cntrObj ' add/update cache
14731535
End If
14741536

@@ -1567,7 +1629,7 @@ Function GetRootObject(ByRef content() As Byte, ByRef trailer As pdfValue, ByRef
15671629
' get either reference or /Root object itself
15681630
Set root = GetRoot(trailer)
15691631
If root.valueType = PDF_ValueType.PDF_Reference Then
1570-
Set root = loadObject(content, xrefTable, root.value)
1632+
Set root = loadObject(root.value)
15711633
'ElseIf root.valueType = PDF_ValueType.PDF_Object Then
15721634
End If
15731635

@@ -1607,7 +1669,7 @@ Sub GetObjectsInTree(ByRef root As pdfValue, ByRef content() As Byte, ByRef xref
16071669
Case PDF_ValueType.PDF_Reference
16081670
' we need to load object
16091671
If Not objects.Exists(CLng(root.value)) Then
1610-
Set obj = loadObject(content, xrefTable, root.value)
1672+
Set obj = loadObject(root.value)
16111673
objects.Add CLng(root.value), obj
16121674
GetObjectsInTree obj, content, xrefTable, objects
16131675
End If
0 Bytes
Binary file not shown.

src/pdfLib.xlsm/Forms/ufPdfInfo.frm

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ Option Explicit
1818
Public pdfDoc As pdfDocument
1919

2020
Private Sub cbContinue_Click()
21-
pdfDoc.renumberIds pdfDoc.trailer, 100
22-
pdfDoc.savePdfAs "renumberd.pdf"
23-
Stop
21+
'pdfDoc.renumberIds pdfDoc.trailer, 100
22+
'pdfDoc.savePdfAs "renumberd.pdf"
23+
'Stop
2424
Me.Hide
2525
Unload Me
2626
End Sub
0 Bytes
Binary file not shown.

src/pdfLib.xlsm/Modules/Main.bas

Lines changed: 27 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,19 @@ Attribute VB_Name = "Main"
22
Option Explicit
33

44

5+
' simple function lets user pick files to combine
6+
Public Sub PickAndCombinePdfFiles()
7+
Dim files() As String
8+
files = PickFiles()
9+
Dim ufFileOrder As ufFileList: Set ufFileOrder = New ufFileList
10+
ufFileOrder.list = files
11+
ufFileOrder.Show
12+
files = ufFileOrder.list
13+
CombinePDFs files, "combined.pdf"
14+
End Sub
15+
16+
517
Public Sub CombinePDFs(ByRef sourceFiles() As String, ByRef outFile As String)
6-
'Dim oPages As pdfValue ' /Type /Pages with /Count # /Kids [ /Page references ...]
718
Dim combinedPdfDoc As pdfDocument: Set combinedPdfDoc = New pdfDocument
819
' initialize with some basic structures
920
combinedPdfDoc.AddInfo
@@ -17,15 +28,8 @@ Public Sub CombinePDFs(ByRef sourceFiles() As String, ByRef outFile As String)
1728
Dim outputFileNum As Integer
1829
outputFileNum = combinedPdfDoc.SavePdfHeader(outFile, offset)
1930

20-
Dim baseId As Long
21-
' we don't yet know the id's used by our /Root, /Info and top level /Pages objs
22-
With combinedPdfDoc
23-
.rootCatalog.id = -1
24-
.Info.id = -2
25-
.pages.id = -3
26-
27-
baseId = .nextObjId
28-
End With
31+
' get where to start renumbering objs in pdf, we need to skip past our toplevel /Root, /Info, /Pages, & /Outlines
32+
Dim baseId As Long: baseId = combinedPdfDoc.nextObjId
2933

3034
Dim ndx As Long
3135
For ndx = LBound(sourceFiles) To UBound(sourceFiles)
@@ -37,35 +41,28 @@ Public Sub CombinePDFs(ByRef sourceFiles() As String, ByRef outFile As String)
3741
pdfDoc.loadPdf sourceFiles(ndx)
3842
pdfDoc.parsePdf
3943

40-
' for each additional document we need to update /Pages
41-
Dim pages As pdfValue
42-
Set pages = pdfDoc.pages()
43-
44-
' since we are about to remove it, we use 1st pdf doc's /Root id for our new top level /Pages (so we have for /Parent references)
45-
' Note: once we add pdfDoc.pages to combinedPdfDoc.pages and save, we can no longer change id of combinedPdfDoc.pages
46-
With combinedPdfDoc
47-
If .pages.id < 0 Then
48-
.pages.id = pdfDoc.rootCatalog.id
49-
End If
50-
End With
51-
52-
' remove /Root object, we need to copy/merge some optional fields such as /Outline for bookmarks
53-
' and ensure only left with 1 /Root
54-
pdfDoc.objectCache.Remove pdfDoc.rootCatalog.id
55-
' also need to remove /Info from cache
56-
If pdfDoc.objectCache.Exists(pdfDoc.Info.id) Then
57-
pdfDoc.objectCache.Remove pdfDoc.Info.id
58-
End If
59-
44+
' adjust obj id's so no conflict with previously stored ones
45+
pdfDoc.renumberIds baseId
46+
6047
' we inject a new top level /Pages object which we add all the document /Pages to
6148
' so we need to add this /Pages to our top level /Pages and add a /Parent indirect reference
6249
combinedPdfDoc.AddPages pdfDoc.pages
6350

51+
' we need to copy/merge some optional fields such as /Outline for bookmarks
6452
If pdfDoc.rootCatalog.hasKey("/Outlines") Then
6553
' hack for now, just copy over
6654
combinedPdfDoc.rootCatalog.asDictionary().Add "/Outlines", pdfDoc.rootCatalog.asDictionary().Item("/Outlines")
6755
End If
6856

57+
' remove /Root object and ensure only left with 1 /Root
58+
' Warning: objectCache is used to convert references to objs, so do not attempt to retrieve any
59+
' objects via obj reference after removing them from objectCache
60+
pdfDoc.objectCache.Remove pdfDoc.rootCatalog.id
61+
' also need to remove /Info from cache
62+
If pdfDoc.objectCache.Exists(pdfDoc.Info.id) Then
63+
pdfDoc.objectCache.Remove pdfDoc.Info.id
64+
End If
65+
' and now save all the pages and other non-toplevel objects to our combined document
6966
combinedPdfDoc.SavePdfObjects outputFileNum, pdfDoc.objectCache, offset
7067

7168
' determine highest id used, 1st obj in next file will start at this + 1
@@ -75,12 +72,6 @@ Public Sub CombinePDFs(ByRef sourceFiles() As String, ByRef outFile As String)
7572
DoEvents
7673
Next ndx
7774

78-
' we need to set valid id's for our top level objs
79-
With combinedPdfDoc
80-
.Info.id = .nextObjId
81-
.rootCatalog.id = .nextObjId
82-
End With
83-
8475
' save updated /Pages object (but not nested objects as already saved)
8576
combinedPdfDoc.SavePdfObject outputFileNum, combinedPdfDoc.pages, offset
8677
combinedPdfDoc.SavePdfObject outputFileNum, combinedPdfDoc.rootCatalog, offset
@@ -92,18 +83,6 @@ Public Sub CombinePDFs(ByRef sourceFiles() As String, ByRef outFile As String)
9283
End Sub
9384

9485

95-
' simple function lets user pick files to combine
96-
Public Sub PickAndCombinePdfFiles()
97-
Dim files() As String
98-
files = PickFiles()
99-
Dim ufFileOrder As ufFileList: Set ufFileOrder = New ufFileList
100-
ufFileOrder.list = files
101-
ufFileOrder.Show
102-
files = ufFileOrder.list
103-
CombinePDFs files, "combined.pdf"
104-
End Sub
105-
106-
10786
'Create a FileDialog object as a File Picker dialog box and returns String array of files selected.
10887
Function PickFiles(Optional ByVal AllowMultiSelect As Boolean = True) As String()
10988
Dim fd As FileDialog

src/pdfLib.xlsm/Modules/toFromBytes.bas

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ Function GetWord(ByRef bytes() As Byte, ByRef offset As Long) As String
137137
errHandler:
138138
Debug.Print "Error: " & Err.Description & " (" & Err.Number & ")"
139139
Stop
140+
Resume
140141
End Function
141142

142143

0 commit comments

Comments
 (0)