Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option --naming for different line name patterns #184

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions ocropus-gpageseg
Original file line number Diff line number Diff line change
@@ -78,6 +78,8 @@ group_column.add_argument('--csminaspect',type=float,default=1.1,

# output parameters
group_output = parser.add_argument_group('output parameters')
group_output.add_argument('--naming',default=False,
help='change naming of the lines with an own pattern, e.g. "--naming %%04d" for four decimal digits, default: %(default)s')
group_output.add_argument('--gray',action='store_true',
help='output grayscale lines as well, default: %(default)s')
group_output.add_argument('-p','--pad',type=int,default=3,
@@ -424,12 +426,17 @@ def process1(job):
lines = [lines[i] for i in lsort]
ocrolib.write_page_segmentation("%s.pseg.png"%outputdir,segmentation)
cleaned = ocrolib.remove_noise(binary,args.noise)
pattern = "01%04x"
if (args.naming):
pattern = args.naming
pattern = "%s/"+pattern
print_info("with file name pattern",pattern)
for i,l in enumerate(lines):
binline = psegutils.extract_masked(1-cleaned,l,pad=args.pad,expand=args.expand)
ocrolib.write_image_binary("%s/01%04x.bin.png"%(outputdir,i+1),binline)
ocrolib.write_image_binary(pattern%(outputdir,i+1)+".bin.png",binline)
if args.gray:
grayline = psegutils.extract_masked(gray,l,pad=args.pad,expand=args.expand)
ocrolib.write_image_gray("%s/01%04x.nrm.png"%(outputdir,i+1),grayline)
ocrolib.write_image_gray(pattern%(outputdir,i+1)+".nrm.png",grayline)
print_info("%6d %s %4.1f %d" % (i, fname, scale, len(lines)))

if len(args.files)==1 and os.path.isdir(args.files[0]):
14 changes: 10 additions & 4 deletions ocropus-hocr
Original file line number Diff line number Diff line change
@@ -24,6 +24,8 @@ For each page like 'book/0001.bin.png', it uses the following files:
parser.add_argument("-b","--nobreaks",action="store_true",help="don't output line breaks")
parser.add_argument("-p","--nopars",action="store_true",help="don't output paragraphs")
parser.add_argument("-s","--fscale",type=float,default=1.0,help="scale factor for translating xheights into font size (use 0 to disable), default: %(default)s")
parser.add_argument('--naming',default=False,
help='indicate naming of the lines with an own pattern, e.g. "--naming %%04d" for four decimal digits, default: %(default)s')
parser.add_argument("-o","--output",default="book.html",help="output file, default: %(default)s")
parser.add_argument('files',nargs='+')
args = parser.parse_args()
@@ -43,13 +45,13 @@ def PN(*args):
E("writing to",args.output)
median_xheight = None
dirs = [ocrolib.allsplitext(name)[0] for name in args.files]
xhfiles = python.sum([glob.glob(d+"/??????.xheight") for d in dirs],[])
xhfiles = python.sum([glob.glob(d+"/*.xheight") for d in dirs],[])
if len(xhfiles)>5:
xheights = [float(ocrolib.read_text(f)) for f in xhfiles]
if len(xheights)>0:
median_xheight = median(xheights)
else:
lfiles = python.sum([glob.glob(d+"/??????.bin.png") for d in dirs],[])
lfiles = python.sum([glob.glob(d+"/*.bin.png") for d in dirs],[])
pyrandom.shuffle(lfiles)
if len(lfiles)>0:
median_xheight = 0.5*median([imread(f).shape[0] for f in lfiles[:100]])
@@ -106,8 +108,12 @@ for arg in args.files:
last_coords = (x0,y0)

# get the text for the line itself

lbase = "%s/%06x"%(base,id)
pattern = "01%04x"
if (args.naming):
pattern = args.naming
pattern = "%s/"+pattern
id=id-0x010000
lbase = pattern%(base,id)

if not os.path.exists(lbase+".txt"):
E("note: line %s produced no output (it may not have contained text)"%(lbase+".bin.png"))
2 changes: 1 addition & 1 deletion ocropus-visualize-results
Original file line number Diff line number Diff line change
@@ -46,7 +46,7 @@ def genpage(d):
os.chdir(d)
with open("index.html","w") as stream:
stream.write("<h1>%s</h1>\n"%d)
images = sorted(glob.glob("??????.bin.png"))
images = sorted(glob.glob("*.bin.png"))
for img in images:
txt = ocrolib.fvariant(img,"txt","")
if os.path.exists(txt):