Skip to content

Commit c73f2ef

Browse files
committed
Added replace command.
1 parent 4ba5544 commit c73f2ef

File tree

3 files changed

+350
-1
lines changed

3 files changed

+350
-1
lines changed

README.md

+22-1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ powered by the [UniPDF](https://github.com/unidoc/unipdf) PDF library.
2424
- [Extract text from PDF files](#extract-text)
2525
- [Extract images from PDF files](#extract-images)
2626
- [Search text in PDF files](#search)
27+
- [Replace text in PDF files](#replace)
2728
- [Export PDF form fields as JSON](#form-export)
2829
- [Fill PDF form fields from JSON file](#form-fill)
2930
- [Fill PDF form fields from FDF file](#fdf-merge)
@@ -36,7 +37,7 @@ powered by the [UniPDF](https://github.com/unidoc/unipdf) PDF library.
3637

3738
## Installation
3839

39-
Minimum required Go version: 1.13. We officially support the 3 latest minor versions of Go, but it may work on earlier ones as well.
40+
Minimum required Go version: 1.18. We officially support the 3 latest minor versions of Go, but it may work on earlier ones as well.
4041

4142
```
4243
git clone [email protected]:unidoc/unipdf-cli.git
@@ -372,6 +373,26 @@ unipdf search input_file.pdf text_to_search
372373
unipdf search -p pass input_file.pdf text_to_search
373374
```
374375

376+
#### Replace
377+
378+
Replace text in PDF files.
379+
380+
```
381+
unipdf replace [FLAG]... INPUT_FILE TEXT
382+
383+
Flags:
384+
-o, --output-file string output file
385+
-r, --replace-text string replacement text
386+
-p, --password string PDF file password
387+
388+
Examples:
389+
unipdf replace input_file.pdf text_to_search
390+
unipdf replace -o output_file.pdf input_file.pdf text_to_search
391+
unipdf replace -o output_file.pdf -r replacement_text input_file.pdf text_to_search
392+
unipdf replace -o output_file.pdf -r replacement_text -p pass input_file.pdf text_to_search
393+
```
394+
395+
375396
#### Form Export
376397

377398
Export JSON representation of form fields.

internal/cli/replace.go

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/*
2+
* This file is subject to the terms and conditions defined in
3+
* file 'LICENSE.md', which is part of this source code package.
4+
*/
5+
6+
package cli
7+
8+
import (
9+
"errors"
10+
"fmt"
11+
12+
"github.com/spf13/cobra"
13+
"github.com/unidoc/unipdf-cli/pkg/pdf"
14+
)
15+
16+
const replaceCmdDesc = `Replace text in PDF files`
17+
18+
var replaceCmdExample = fmt.Sprintf("%s\n%s\n%s\n%s\n",
19+
fmt.Sprintf("%s replace input_file.pdf text_to_search", appName),
20+
fmt.Sprintf("%s replace -o output_file input_file.pdf text_to_search", appName),
21+
fmt.Sprintf("%s replace -o output_file -r new_text input_file.pdf text_to_search", appName),
22+
fmt.Sprintf("%s replace -o output_file -r new_text -p pass input_file.pdf text_to_search", appName),
23+
)
24+
25+
// replaceCmd represents the replace command.
26+
var replaceCmd = &cobra.Command{
27+
Use: "replace [FLAG]... INPUT_FILE TEXT",
28+
Short: "Replace text in PDF files",
29+
Long: replaceCmdDesc,
30+
Example: replaceCmdExample,
31+
DisableFlagsInUseLine: true,
32+
Run: func(cmd *cobra.Command, args []string) {
33+
// Parse input parameters.
34+
inputPath := args[0]
35+
text := args[1]
36+
password, _ := cmd.Flags().GetString("password")
37+
38+
// Parse output file.
39+
outputPath, _ := cmd.Flags().GetString("output-file")
40+
if outputPath == "" {
41+
outputPath = inputPath
42+
}
43+
44+
// Parse replaceText.
45+
replaceText, _ := cmd.Flags().GetString("replace-text")
46+
if replaceText == "" {
47+
replaceText = text
48+
}
49+
50+
// Search text.
51+
err := pdf.Replace(inputPath, outputPath, text, replaceText, password)
52+
if err != nil {
53+
printErr("Could not replace the specified text: %s\n", err)
54+
}
55+
56+
fmt.Printf("Successfully replaced text %s with %s\n", text, replaceText)
57+
fmt.Printf("Output file saved to %s\n", outputPath)
58+
},
59+
Args: func(_ *cobra.Command, args []string) error {
60+
if len(args) < 2 {
61+
return errors.New("must provide a PDF file and the text to search")
62+
}
63+
64+
return nil
65+
},
66+
}
67+
68+
func init() {
69+
rootCmd.AddCommand(replaceCmd)
70+
71+
replaceCmd.Flags().StringP("output-file", "o", "", "output file")
72+
replaceCmd.Flags().StringP("replace-text", "r", "", "replacement text")
73+
replaceCmd.Flags().StringP("password", "p", "", "input file password")
74+
}

pkg/pdf/replace.go

+254
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
/*
2+
* This file is subject to the terms and conditions defined in
3+
* file 'LICENSE.md', which is part of this source code package.
4+
*/
5+
6+
package pdf
7+
8+
import (
9+
"strings"
10+
11+
"github.com/unidoc/unipdf/v3/common"
12+
"github.com/unidoc/unipdf/v3/contentstream"
13+
"github.com/unidoc/unipdf/v3/core"
14+
"github.com/unidoc/unipdf/v3/model"
15+
unipdf "github.com/unidoc/unipdf/v3/model"
16+
)
17+
18+
type textChunk struct {
19+
font *model.PdfFont
20+
strObj *core.PdfObjectString
21+
val string
22+
idx int
23+
}
24+
25+
func (tc *textChunk) encode() {
26+
var encoded string
27+
if font := tc.font; font != nil {
28+
encodedBytes, numMisses := font.StringToCharcodeBytes(tc.val)
29+
if numMisses != 0 {
30+
common.Log.Debug("WARN: some runes could not be encoded.\n\t%s -> %v")
31+
}
32+
encoded = string(encodedBytes)
33+
}
34+
35+
*tc.strObj = *core.MakeString(encoded)
36+
}
37+
38+
type textChunks struct {
39+
text string
40+
chunks []*textChunk
41+
}
42+
43+
func (tc *textChunks) replace(search, replacement string) {
44+
text := tc.text
45+
chunks := tc.chunks
46+
47+
// Steps:
48+
// 1. Search for the first index of the search term in the text.
49+
// 2. Use the found index to match the text chunk which contains
50+
// (or partly contains) the search term.
51+
// 3. Replace the search term in the found text chunk. The search term
52+
// will not always start at the beginning of the text chunk. Also,
53+
// the search term could be split in multiple text chunks. If that's
54+
// the case, replace the portion of the search term in the found
55+
// chunk and continue removing characters from the following chunks
56+
// until the search term has been completely erased.
57+
// 4. Offset the text chunks slice to the last processed text chunk from
58+
// the previous step, if the text chunk was not completely erased, or
59+
// to the next one otherwise. This is necessary so that the visited
60+
// text chunks are skipped when searching for the next occurrence of the
61+
// search term.
62+
// 5. Discard the part of the text up to (and including) the index found
63+
// in step one.
64+
// 6. Move to step 1 in order to search for the search term in the remaining
65+
// text.
66+
var chunkOffset int
67+
matchIdx := strings.Index(text, search)
68+
for currMatchIdx := matchIdx; matchIdx != -1; {
69+
for i, chunk := range chunks[chunkOffset:] {
70+
idx, lenChunk := chunk.idx, len(chunk.val)
71+
if currMatchIdx < idx || currMatchIdx > idx+lenChunk-1 {
72+
continue
73+
}
74+
chunkOffset += i + 1
75+
76+
start := currMatchIdx - idx
77+
remaining := len(search) - (lenChunk - start)
78+
79+
replaceVal := chunk.val[:start] + replacement
80+
if remaining < 0 {
81+
replaceVal += chunk.val[lenChunk+remaining:]
82+
chunkOffset--
83+
}
84+
85+
chunk.val = replaceVal
86+
chunk.encode()
87+
88+
for j := chunkOffset; remaining > 0; j++ {
89+
c := chunks[j]
90+
l := len(c.val)
91+
92+
if l > remaining {
93+
c.val = c.val[remaining:]
94+
} else {
95+
c.val = ""
96+
chunkOffset++
97+
}
98+
99+
c.encode()
100+
remaining -= l
101+
}
102+
103+
break
104+
}
105+
106+
text = text[matchIdx+1:]
107+
matchIdx = strings.Index(text, search)
108+
currMatchIdx += matchIdx + 1
109+
}
110+
111+
tc.text = strings.Replace(tc.text, search, replacement, -1)
112+
}
113+
114+
// Replace searches the provided text in the PDF file specified by the inputPath
115+
// parameter and replaces it by the newText. A password can be passed in for encrypted input files.
116+
// The result is saved to outputPath.
117+
func Replace(inputPath, outputPath, text, replaceText, password string) error {
118+
// Read input file.
119+
r, pages, _, _, err := readPDF(inputPath, password)
120+
if err != nil {
121+
return err
122+
}
123+
124+
w := unipdf.NewPdfWriter()
125+
126+
// Search specified text.
127+
for i := 0; i < pages; i++ {
128+
// Get page.
129+
numPage := i + 1
130+
131+
page, err := r.GetPage(numPage)
132+
if err != nil {
133+
return err
134+
}
135+
136+
err = searchReplacePageText(page, text, replaceText)
137+
if err != nil {
138+
return err
139+
}
140+
141+
err = w.AddPage(page)
142+
if err != nil {
143+
return err
144+
}
145+
}
146+
147+
// Write output file.
148+
safe := inputPath == outputPath
149+
return writePDF(outputPath, &w, safe)
150+
}
151+
152+
func searchReplacePageText(page *model.PdfPage, searchText, replaceText string) error {
153+
contents, err := page.GetAllContentStreams()
154+
if err != nil {
155+
return err
156+
}
157+
158+
ops, err := contentstream.NewContentStreamParser(contents).Parse()
159+
if err != nil {
160+
return err
161+
}
162+
163+
// Generate text chunks.
164+
var currFont *model.PdfFont
165+
tc := textChunks{}
166+
167+
textProcFunc := func(objptr *core.PdfObject) {
168+
strObj, ok := core.GetString(*objptr)
169+
if !ok {
170+
common.Log.Debug("Invalid parameter, skipping")
171+
return
172+
}
173+
174+
str := strObj.String()
175+
if currFont != nil {
176+
decoded, _, numMisses := currFont.CharcodeBytesToUnicode(strObj.Bytes())
177+
if numMisses != 0 {
178+
common.Log.Debug("WARN: some charcodes could not be decoded.\n\t%v -> %s", strObj.Bytes(), decoded)
179+
}
180+
str = decoded
181+
}
182+
183+
tc.chunks = append(tc.chunks, &textChunk{
184+
font: currFont,
185+
strObj: strObj,
186+
val: str,
187+
idx: len(tc.text),
188+
})
189+
tc.text += str
190+
}
191+
192+
processor := contentstream.NewContentStreamProcessor(*ops)
193+
processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
194+
func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState, resources *model.PdfPageResources) error {
195+
switch op.Operand {
196+
case `Tj`, `'`:
197+
if len(op.Params) != 1 {
198+
common.Log.Debug("Invalid: Tj/' with invalid set of parameters - skip")
199+
return nil
200+
}
201+
textProcFunc(&op.Params[0])
202+
case `''`:
203+
if len(op.Params) != 3 {
204+
common.Log.Debug("Invalid: '' with invalid set of parameters - skip")
205+
return nil
206+
}
207+
textProcFunc(&op.Params[3])
208+
case `TJ`:
209+
if len(op.Params) != 1 {
210+
common.Log.Debug("Invalid: TJ with invalid set of parameters - skip")
211+
return nil
212+
}
213+
arr, _ := core.GetArray(op.Params[0])
214+
for i := range arr.Elements() {
215+
obj := arr.Get(i)
216+
textProcFunc(&obj)
217+
arr.Set(i, obj)
218+
}
219+
case "Tf":
220+
if len(op.Params) != 2 {
221+
common.Log.Debug("Invalid: Tf with invalid set of parameters - skip")
222+
return nil
223+
}
224+
225+
fname, ok := core.GetName(op.Params[0])
226+
if !ok || fname == nil {
227+
common.Log.Debug("ERROR: could not get font name")
228+
return nil
229+
}
230+
231+
fObj, has := resources.GetFontByName(*fname)
232+
if !has {
233+
common.Log.Debug("ERROR: font %s not found", fname.String())
234+
return nil
235+
}
236+
237+
pdfFont, err := model.NewPdfFontFromPdfObject(fObj)
238+
if err != nil {
239+
common.Log.Debug("ERROR: loading font")
240+
return nil
241+
}
242+
currFont = pdfFont
243+
}
244+
245+
return nil
246+
})
247+
248+
if err = processor.Process(page.Resources); err != nil {
249+
return err
250+
}
251+
252+
tc.replace(searchText, replaceText)
253+
return page.SetContentStreams([]string{ops.String()}, core.NewFlateEncoder())
254+
}

0 commit comments

Comments
 (0)