1
+ import cv2
2
+ import numpy as np
3
+ import pytesseract
4
+ import os
5
+ import datetime
6
+ import base64
7
+
8
# --- External tool locations (machine-specific; adjust per installation) ---
# Default Tesseract install location, kept for reference:
#   C:\Program Files\Tesseract-OCR\tesseract.exe
# Poppler "bin" directory handed to pdf2image when rendering PDF pages.
poppler_path = r'D:\downloads\poppler-0.68.0_x86\poppler-0.68.0\bin'
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\madhu\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"

# MongoDB
from pymongo import MongoClient

# Folder that holds the page images (alternative used previously:
# D:\projects\pytesseract\images).
output_file = r'D:\downloads\img'
# Folder that holds the source PDFs.
folder_path = r'D:\downloads\pdf'

# Heights of the rectangles found on the page being processed; filled by
# img_detect() and cleared by image_upload().
he = []
import urllib.parse

# SECURITY NOTE(review): credentials are hard-coded in source. They are only
# referenced by the commented-out authenticated URI below; move them to
# environment variables / a secrets store and rotate them.
Username = 'devops_admin'
Password = 'Devops1234'
username = urllib.parse.quote_plus(Username)
password = urllib.parse.quote_plus(Password)

# Connect with host and port. A single client is created here; the earlier
# throw-away ``MongoClient()`` that was immediately overwritten is gone.
client = MongoClient('mongodb://localhost:27017/')
# Authenticated alternative (host was lost from the original comment):
# client = MongoClient('mongodb://%s:%s@<host>:27017' % (username, password))

# Access database
mydatabase = client['Data_conversion']

# Access collections of the database
mycollection = mydatabase['test']
mycollection2 = mydatabase['testimage']
myimage = mydatabase['test2']

# Sample document; not inserted anywhere in this file.
rec = {
    'title': 'MongoDB and Python',
    'description': 'MongoDB is no SQL database',
    'tags': ['mongodb', 'database', 'NoSQL'],
    'viewers': 104,
}

# Timestamp taken once at start-up; stored as 'Created on' in every record.
dt = datetime.datetime.now()
print(dt)
# Header lines of the current page. Records read lis[0] as "Assembly Name"
# and lis[1] as "Section"; page() appends matching OCR lines to this list.
lis = ["", ""]

# Running number used by imgetobase() to name the sample<N>.png files.
count = 0
56
def imgetobase(file):
    """Save an image as ``sample<N>.png`` and return that file name.

    ``N`` is the module-level ``count``, which is incremented after every
    successful write so each call produces a new file in the current
    working directory.

    Args:
        file: Image array accepted by ``cv2.imshow`` / ``cv2.imwrite``.

    Returns:
        The name of the file that was written.

    Raises:
        OSError: If OpenCV reports that the file could not be written.
    """
    global count
    f = f"sample{count}.png"

    cv2.imshow('Resized', file)
    # cv2.imwrite signals failure through its return value, so check it
    # instead of discovering the missing file later.
    if not cv2.imwrite(f, file):
        raise OSError(f"could not write image file {f!r}")
    cv2.waitKey(3)
    count += 1

    # The S3 upload (bucket "ivin-pro-data-conversion") is disabled. The
    # boto3 client and the list_buckets() round trip that used to run on
    # every call were removed because nothing consumed their result.
    return f
73
+
74
+ # def pagetobase(image):
75
+ # image = open(image, 'rb')
76
+ # image_read = image.read()
77
+ # image_64_encode = base64.encodebytes(image_read) #encodestring also works aswell as decodestring
78
+ # # print('This is the image in base64: ' + str(image_64_encode))
79
+ # return str(image_64_encode)
80
+
81
+
82
+
83
def page(image):
    """OCR a page and collect its "Assembly" / "Section" lines into ``lis``.

    The recognised text, with every occurrence of ``Photo`` removed, is
    written to ``text.txt`` (overwriting it) followed by a newline. The file
    is then read back line by line and each line containing ``Assembly`` or
    ``Section`` is appended to the module-level ``lis``.

    Args:
        image: Anything ``pytesseract.image_to_string`` accepts.
    """
    text = pytesseract.pytesseract.image_to_string(image)
    # str.replace returns a new string; the result must be kept or the
    # 'Photo' markers stay in the text.
    text = text.replace('Photo', "")

    with open('text.txt', 'w') as out:
        out.write(text + "\n")

    # Read the file back (rather than splitting ``text``) so line boundaries
    # are exactly the ones the text-mode file layer produces.
    with open('text.txt', 'r') as src:
        for line in src:
            if "Assembly" in line or "Section" in line:
                lis.append(line)
99
+ # print(lis)
100
+
101
+
102
+
103
c = 1


def _crop_and_store(x, y, w, h, im2, img_file, *, color, thickness,
                    min_width=None, snapshot_path=None):
    """Crop one third-width strip, OCR it and store the result in MongoDB.

    Shared implementation behind covert(), covert2() and covert3().

    Nothing happens unless ``400 <= h < 1000`` (and ``w > min_width`` when
    ``min_width`` is given). Otherwise a rectangle ``int(w / 3)`` wide is
    drawn on ``im2`` at ``(x, y)``. Unless the strip sits at the origin, it
    is then cropped, shown, saved through imgetobase(), OCR'd, appended to
    ``text.txt`` and inserted into ``mycollection2`` and ``mycollection``.

    Args:
        x, y: Top-left corner of the strip in ``im2``.
        w, h: Size of the full rectangle; only a third of ``w`` is cropped.
        im2: Image array; it is drawn on in place.
        img_file: Source file name stored as ``Voter_file_tracker``.
        color: Colour tuple for the outline.
        thickness: Outline thickness in pixels.
        min_width: If given, rectangles with ``w <= min_width`` are skipped.
        snapshot_path: If given, the annotated ``im2`` is written there.
    """
    if not 400 <= h < 1000:
        return
    if min_width is not None and w <= min_width:
        return

    strip_width = int(w / 3)
    # The outline is drawn before the origin check below, and before
    # cropping, so the crop contains it.
    cv2.rectangle(im2, (x, y), (x + strip_width, y + h), color, thickness)

    # A strip at the image origin is skipped.
    if x == 0 and y == 0:
        return

    cropped = im2[y:y + h, x:x + strip_width]
    cv2.imshow('d', cropped)
    cv2.waitKey(3)
    if snapshot_path is not None:
        cv2.imwrite(snapshot_path, im2)

    b = imgetobase(cropped)
    text = pytesseract.image_to_string(cropped)
    # The previous code also opened 'test.txt' and never closed it; only
    # 'text.txt' was ever written, so only that file is touched here.
    with open('text.txt', 'a') as log:
        log.write(text)

    mycollection2.insert_one({
        'Voter_file_tracker': img_file,
        "Voter_Image": b,
    })
    mycollection.insert_one({
        'Voter_file_tracker': img_file,
        "votere_slip": b,
        "Assembly Name": lis[0],
        "Section": lis[1],
        'details': text,
        'Created on': dt,
    })


def covert(x, y, w, h, im2, img_file):
    """Process one strip, outlined in green (thickness 5).

    Applies only the height filter (``400 <= h < 1000``); see
    _crop_and_store() for the full sequence of side effects.
    """
    _crop_and_store(x, y, w, h, im2, img_file,
                    color=(0, 255, 0), thickness=5)


def covert2(x, y, w, h, im2, img_file):
    """Process one strip, outlined with (0, 255, 255) at thickness 3.

    Like covert() but additionally requires ``w > 400``.
    """
    _crop_and_store(x, y, w, h, im2, img_file,
                    color=(0, 255, 255), thickness=3, min_width=400)


def covert3(x, y, w, h, im2, img_file):
    """Process one strip, outlined in white (thickness 3).

    Like covert2() but also writes the annotated full image to
    ``cropped.png``.
    """
    _crop_and_store(x, y, w, h, im2, img_file,
                    color=(255, 255, 255), thickness=3, min_width=400,
                    snapshot_path="cropped.png")
227
+
228
+
229
+
230
def ima(x, y, w, h, im2, img_file):
    """Load a page image and process one rectangle as three equal strips.

    The image is read fresh from disk, labelled with the text ``Rectangle``
    at ``(x, y)``, and covert() is called for the strips starting at ``x``,
    ``x + int(w / 3)`` and ``x + 2 * int(w / 3)``.

    Args:
        x, y: Top-left corner of the rectangle.
        w, h: Width and height of the rectangle.
        im2: Path of the image file to load.
        img_file: Source file name passed through to covert().

    Raises:
        OSError: If the image cannot be read.
    """
    image = cv2.imread(im2)
    # cv2.imread returns None instead of raising on failure.
    if image is None:
        raise OSError(f"could not read image {im2!r}")

    cv2.putText(image, 'Rectangle', (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)

    third = int(w / 3)
    for offset in (0, third, 2 * third):
        covert(x + offset, y, w, h, image, img_file)
243
+
244
+
245
+
246
# Per-page accumulators filled by img_detect(): rectangle areas and the
# matching (height, width, x, y) tuples.
area = []
value = []


def img_detect(img_path, img_file):
    """Find non-square four-cornered contours on a page and process each.

    The image is thresholded at 50, contours are extracted, and every
    contour whose polygon approximation has four vertices and whose
    bounding-box aspect ratio lies outside ``[0.9, 1.1]`` is recorded in the
    module-level ``area``, ``value`` and ``he`` lists. ima() is then called
    once per recorded rectangle.

    Args:
        img_path: Path of the page image.
        img_file: Source file name passed through to ima().

    Raises:
        OSError: If the image cannot be read.
    """
    img = cv2.imread(img_path)
    # cv2.imread returns None instead of raising on failure.
    if img is None:
        raise OSError(f"could not read image {img_path!r}")

    # Start from empty accumulators so rectangles left over from another
    # page are never replayed against this one.
    area.clear()
    value.clear()
    he.clear()

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 50, 255, cv2.THRESH_BINARY)
    # findContours returns 2 values on OpenCV 4 and 3 on OpenCV 3; the
    # contour list is second-to-last in both.
    contours = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]

    for cnt in contours:
        approx = cv2.approxPolyDP(cnt, 0.01 * cv2.arcLength(cnt, True), True)
        if len(approx) != 4:
            continue
        x, y, w, h = cv2.boundingRect(cnt)
        ratio = float(w) / h
        # Roughly square boxes are ignored.
        if 0.9 <= ratio <= 1.1:
            continue
        area.append(h * w)
        value.append((h, w, x, y))
        he.append(h)

    for rect_h, rect_w, rect_x, rect_y in value:
        ima(x=rect_x, y=rect_y, w=rect_w, h=rect_h, im2=img_path, img_file=img_file)
281
+
282
+
283
+ import os
284
+
285
+
286
def image_upload(image_file):
    """Run img_detect() on every entry of a folder.

    Args:
        image_file: Path of the folder whose entries are processed. Every
            directory entry is passed to img_detect(), in ``os.listdir``
            order.
    """
    names = os.listdir(image_file)
    # Report the folder actually being processed (the count used to be
    # taken from the PDF folder).
    print("count of images :", len(names))

    for k, name in enumerate(names):
        print(k, 'started')
        # Reset the per-page accumulators shared with img_detect().
        area.clear()
        value.clear()
        he.clear()
        img_detect(img_path=os.path.join(image_file, name), img_file=name)
        # os.remove(os.path.join(image_file, name))
        print(k, 'done')
298
+
299
+
300
def converting_pdftoimg(pdf_path, dpi=500):
    """Render every page of a PDF to a PNG file in ``output_file``.

    Page ``i`` (0-based) is saved as ``<pdf file name><i>.png``, where the
    PDF file name includes its extension.

    Args:
        pdf_path: Path of the PDF to convert.
        dpi: Rendering resolution passed to pdf2image.
    """
    from pdf2image import convert_from_path

    images = convert_from_path(pdf_path, dpi, poppler_path=poppler_path)
    # basename handles the platform's separators instead of assuming a
    # backslash-delimited path.
    base_name = os.path.basename(pdf_path)
    print(base_name)

    for i, image in enumerate(images):
        print(i, 'is pdf converting into images')
        fname = f'{base_name}{i}.png'
        image.save(os.path.join(output_file, fname), "PNG")
311
+
312
+
313
+
314
# List the PDF input folder so the run log shows what is available.
l = os.listdir(folder_path)
print(l)

# The PDF -> image conversion step is currently switched off; the images
# are expected to be in ``output_file`` already. To re-enable it:
# for pdf_name in l:
#     print(pdf_name)
#     converting_pdftoimg(pdf_path=os.path.join(folder_path, pdf_name))

# Detect, OCR and store every image in the image folder.
image_upload(output_file)
322
+
323
+
324
+ # dt = datetime.datetime.now()
0 commit comments