nki.pinnacle.unarchiver.py
# pip install pyodbc
import pyodbc
import configparser
import sys
import os
import time
import collections
import re
import tarfile

#settings
overwrite = False  # overwrite if a previous dump is found in local_pinnacle_dir
epidclin_basedir = r"W:\Epidclin\EPIDOS\ECR\EVP.INIs\ALL"
local_pinnacle_dir = r"Z:\brent\pinnacle_dump"
local_pinnacle_dbname = "pinnacle_local"  # as named in your datasources.ini
pinnacle_pacs = r"\\172.19.36.17\archive\Kliniek"
zexe = r"D:\postdoc\code\7za.exe"
internal_untar = False  # slower, but needs no external dependencies
sqlquery = """SELECT i.ida, t.Field_Name, s.site_name
FROM (
    ident i
    JOIN txfield t ON i.pat_id1 = t.pat_id1
    JOIN site s ON i.pat_id1 = s.Pat_ID1
)
WHERE t.last_tx_dttm > '2016-07-01'
  AND t.last_tx_dttm < '2017-07-01' -- after the last treatment it takes about a month before a plan is archived and available
  AND t.Type_Enum = 13 -- VMAT
  AND t.MachineCharID IN (262,265,275) -- Agility
  AND t.IsFFF = 0
  AND t.Version = 0 -- Version = 0 is the currently 'active' plan (beam), so this can change! TODO: make this optional
  AND s.Version = 0
  AND t.FLD_ID IN (SELECT fld_id FROM txfieldpoint WHERE energy = 6)
  AND s.Site_Name LIKE '%<%>%' -- sites without a UPI are non-linac treatments
  AND i.ida NOT LIKE 'Z%' -- Z is not clinical
  AND t.Field_Name <> 'NIET' -- clinicians don't always clean up their messes
  AND s.site_name <> 'NIET'
  AND ISNUMERIC(i.ida) = 1 -- normal clinical plans that are irradiated on patients have numeric-only idas; idas with letters are for research
ORDER BY i.ida, t.Field_Name, s.site_name ASC;
"""
####################################################################################
# Do not edit below this line
####################################################################################
#LPDB constants
lpdb_header = """SiteName = "AvL/NKI Amsterdam";
InstitutionList ={
Institution ={"""
lpdb_footer = """ };
};
DefaultInstitution = "Institution_0";
UseBestMountPoint = 0;
DefaultBackupIODevice = "";
NextUniqueInstitutionID = 99;
NextUniquePatientID = 99;
NextUniqueBackupID = 0;
RestoreSolarisData = 1;
CreateNewInstitution = 0;
BackupRestoreDest = "UNIX File";
BackupIndexSortType = "Backup Volume";
TypeOfBackup = "Manual";
BackupCompressed = 0;
BackupDebugOutputEnabled = 0;
AtJobId = "";
AtJobHost = "";
UnattendedBackupSet = 0;
PrintReport = 0;
AddHeaderInfoToIndex = 0;
ReadMultipleHeaders = 0;
TempRestoreDir = "/tmp";
GenerateHTMLIndex = 0;
BackupVolumeNameList ={
};
RestoreSpaceThreshold = 99999.9;
NumBackupInstitutionFiles = 200;
MaxAgeOfBackupInstitutionFiles = 180;
"""
# Start program
start_time = time.time()
sys.stderr.write("Welcome to the Pinnacle Unarchiver! Please stand by while your query is run.\n")
if not os.path.isdir(local_pinnacle_dir):
os.makedirs(local_pinnacle_dir)
lpdbfile = os.path.join(local_pinnacle_dir,'LPDB')
#run sql query
cnxn = pyodbc.connect(
r'DRIVER={ODBC Driver 11 for SQL Server};'
r'SERVER=rtmquery.radim.local\mquery;'
# r'DATABASE=;'
r'UID=epid-dos;'
r'PWD=ep1d-d0s'
)
cursor = cnxn.cursor()
cursor.execute(sqlquery)
rows = cursor.fetchall()
studydata = collections.defaultdict(dict)
mrn_fails = set()
parser_errors = []
sys.stderr.write(str(len(rows))+" results in sql query.\n")
#NOTE: currently active patients are NOT YET in the archive. Only when they are deactivated (i.e. their treatment is over) do they appear in the archive.
#find pinnacle urls
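# A matching section in an <mrn>.evp.ini file is assumed to look roughly like
# this (hypothetical sketch; only the keys read below are known for certain):
#   [SomeField]
#   FieldUrl = ...<mrn>...
#   Plan = <upi>
#   EpidISOCDosecGy = 0
#   LowEpidISOCDosecGy = 0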
with open(os.path.join(local_pinnacle_dir,'sql.results'),'w') as sqllog:
for row in rows:
#sqllog.write('%s \t %s \t %s' % (row.ida, row.Field_Name, row.site_name))
        #needed for the testsetrunner: ida, upi; in the epidos_sql_db: patientid, upi
upi = str(row.site_name.split('<')[1].split('>')[0])
mrn = str(row.ida)
filename = os.path.join( epidclin_basedir, mrn+".evp.ini")
try:
assert( os.path.isfile(filename) ) #configparser does not throw errors when file not found
urls=[] #there will be multiple beams(/fields) for a matching TP
try:
ini1 = configparser.ConfigParser(comment_prefixes=('#', ';', '=')) #add = to ignore keyless entries (yeah...)
ini1.read(filename)
                #only the field urls (== beam urls) are needed for the dosia dump
for s in ini1.sections():
try:
                        if mrn in ini1[s]['FieldUrl'] and 'Epid' not in ini1[s]['FieldUrl'] and ini1[s]['Plan'] == upi:
                            #if an EpidISOCDosecGy or LowEpidISOCDosecGy key exists and is more than 0, skip this section (it is an EPID field)
                            try:
                                if float(ini1[s]['EpidISOCDosecGy']) > 0:
                                    continue  # EPID field, go on to the next section
                            except (KeyError, ValueError):
                                pass  # no key or float conversion error, no problem
                            try:
                                if float(ini1[s]['LowEpidISOCDosecGy']) > 0:
                                    continue  # EPID field, go on to the next section
                            except (KeyError, ValueError):
                                pass  # no key or float conversion error, no problem
                            urls.append(ini1[s]['FieldUrl'])
                            sqllog.write('%s\t%s\t%s\n' % (row.ida, row.Field_Name, row.site_name))
                            sqllog.write(ini1[s]['FieldUrl'] + '\n')
                    except KeyError:
                        pass  # this section lacks a FieldUrl or Plan key, skip it
                if len(urls) == 0:
                    raise FileNotFoundError("No Pinnacle urls for mrn " + mrn + ". Skipping...\n")
                studydata[mrn][upi] = urls  # other upis with urls may be added later/may exist already
            except configparser.ParsingError:
                parser_errors.append(mrn)
                #sys.stderr.write("Could not parse epidclin data for mrn: "+mrn+". Skipping...\n")
        except AssertionError:
            mrn_fails.update({mrn})
            #sys.stderr.write("No entry in epidclin data found for mrn: "+mrn+". Skipping...\n")
except FileNotFoundError as e:
mrn_fails.update({mrn})
#sys.stderr.write(str(e))
except ValueError as e:
mrn_fails.update({mrn})
sys.stderr.write(str(e))
sys.stderr.write(str(len(parser_errors))+" parser errors and skipped evp files.\n")
sys.stderr.write(str(len(mrn_fails))+" mrns had no or missing data.\n")
sys.stderr.write(str(len(studydata))+" mrns with full data found.\n")
skipped_arcfiles = []
#first create list of archive mrn folders, then only take latest (it should contain all older info!)
for root, dirs, files in os.walk(pinnacle_pacs):
    for filename in files:
        filemrn = filename.split('.')[0]
        #check whether there is an archive for this mrn
        if filemrn in studydata:
            new_arcfile = os.path.join(root, filename)
            if 'arcfile' in studydata[filemrn]:
                #we already found an archive with this mrn, so compare dates.
                existing_arcfile = studydata[filemrn]['arcfile']
                #the date is simply the topmost dir (YYYYMMDD)
                if int(existing_arcfile.split(os.path.sep)[-2]) > int(new_arcfile.split(os.path.sep)[-2]):
                    skipped_arcfiles.append(new_arcfile)
                    continue  # keep the newer archive and keep checking the remaining files
            studydata[filemrn]['arcfile'] = new_arcfile
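# Hypothetical example of the assumed archive layout (the file name is made up):
#   \\172.19.36.17\archive\Kliniek\20170101\1234567.tar
# so that element [-2] after splitting on os.path.sep is the date dir '20170101'.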
sys.stderr.write(str(len(skipped_arcfiles))+" mrn archive duplicates skipped.\n")
#remove the mrn if no archive was found in the previous loop
no_archives_found = []
for mrn in list(studydata.keys()):
    if 'arcfile' not in studydata[mrn]:
        #no file found in the previous loop, so remove the mrn because there is nothing to study.
        del studydata[mrn]
        no_archives_found.append(mrn)
sys.stderr.write(str(len(no_archives_found))+" mrns did not have a file in the pinnacle archive. Skipping them from study.\n")
#dump the archive to local pinnacle dir
skip_dump=[]
inst_files=[]
for mrn, subdicts in studydata.items():
    #even when skipping the dump, we still generate a new LPDB file, so inst_files must be filled
    inst_files.append(os.path.join(local_pinnacle_dir, str(mrn) + 'Institution'))
    if os.path.isfile(os.path.join(local_pinnacle_dir, str(mrn) + 'Institution')) and not overwrite:
        skip_dump.append(mrn)
        #sys.stderr.write("Mrn "+mrn+" already dumped, skipping dump...\n")
        continue
    else:
        filename = subdicts['arcfile']
        sys.stderr.write("Found mrn file " + filename + " in the Pinnacle archive, copying...\n")
        if internal_untar:
            #use Python's internal untar: no dependencies, but slower.
            with tarfile.open(filename, 'r') as tarball:
                for f in tarball:  # replace invalid characters such as ':' with underscores, like tar does
                    f.name = re.sub(r'[:]', '_', f.name)
                tarball.extractall(local_pinnacle_dir)
        else:
            #use 7-Zip because it is much faster and auto-renames illegal characters (its error output is harmless)
            os.system(zexe + ' x ' + filename + ' -so | ' + zexe + ' x -aoa -si -ttar -o' + local_pinnacle_dir)
        #everything is extracted into the same dir; rescue the Institution file because we need it later.
        os.rename(os.path.join(local_pinnacle_dir, 'Institution'), os.path.join(local_pinnacle_dir, str(mrn) + 'Institution'))
sys.stderr.write(str(len(skip_dump))+" mrn dumps skipped because existing data encountered.\n")
#make lpdb file
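# For reference, the relevant bits of an Institution file are assumed to look
# roughly like this (hypothetical sketch; only the markers matched below are
# known for certain):
#   PatientLite ={
#     ...
#     FormattedDescription = "...";
#     ...
#   };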
with open(lpdbfile, 'w') as dest_file:
    dest_file.write(lpdb_header)
    firstFile = True  # copy the preamble only from the first Institution file
    lastFile = False
    for i, inst_file in enumerate(inst_files):
        if i + 1 == len(inst_files):
            lastFile = True
        with open(inst_file, 'r', errors='ignore') as inst_file_content:
            writeline = False
            if firstFile:
                writeline = True
            for line in inst_file_content:
                if 'PatientLite ={' in line:
                    writeline = True
                if writeline:
                    if 'FormattedDescription' in line:
                        #overwrite the description with a fixed placeholder
                        line = 'FormattedDescription = "DEMO&&IMRT&&&&-1&&LGR LongR 20130402&&2013-05-14 09:24:45";\n'
                    dest_file.write(line)
                if writeline and '};' in line:
                    writeline = False
                    if firstFile:
                        firstFile = False
                    if lastFile:
                        writeline = True
        #os.remove(inst_file)  # don't remove: it is proof of the dump for a possible next run
    dest_file.write(lpdb_footer)
#update the pinnacle urls to local urls and dump them to disk for further use
# NOTE!!!! Pinnacle updates the patient number (NOT the MRN) every time a patient comes back; all previous patient numbers are updated to the new one. The EPID archives DO NOT do this, so we must find the latest patient number and update the old urls obtained from epidclin_basedir.
newurls = []
for mrn, subdicts in studydata.items():
    for upi, urls in subdicts.items():
        if upi == 'arcfile':
            continue
        new_pat_id = None
        with open(os.path.join(local_pinnacle_dir, str(mrn) + 'Institution'), 'r', errors='ignore') as inst_file_content:
            for line in inst_file_content:
                if 'PatientID =' in line:
                    new_pat_id = re.findall('[0-9]+', line)[0]  # should be only one number
                    break  # done, there should be only one PatientID line
        #loop over the urls, update them to the new db name and the latest patient number
        for url in urls:
            new_url = re.sub(r'\[(.*)\]', '[' + local_pinnacle_dbname + ']', url, 1)  # replace the pinnacle db name
            new_url = re.sub(r'_(.\d+)\.', '_' + new_pat_id + '.', new_url, 1)  # replace with the correct patient id
            newurls.append(new_url + '\n')
            if mrn not in url:
                sys.stderr.write("mrn/upi mismatch: " + mrn + ", " + upi + ", " + url + "\n")
with open(os.path.join(local_pinnacle_dir, 'purls.txt'), 'w') as purls:
    purls.writelines(newurls)  # writelines() does not add newlines; they were appended above
end_time = time.time()
sys.stderr.write("Task completed. Total runtime: " + str(end_time - start_time) + " seconds.\n")