Skip to content
This repository was archived by the owner on Jul 22, 2024. It is now read-only.

Replaces existing SED and PY file #61

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions data/rtf-to-csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Converts all rtf files in a given folder into a single CSV with
# a file name column. This is intended for use in preprocessing
# RTF files typically used by the medical industry for consumption
# outside of Windows.
#
# This requires the striprtf module to be installed.
from __future__ import print_function
from striprtf.striprtf import rtf_to_text

import csv
import os
import sys

def main(folder):
replacement_dict = {'{period}':'.', '{comma}':',', 'YYYY': '',
'yyyy':'', 'xxx':'', 'xx':'', 'XXXXX': '',
'xxxx':'', '.Next':'. Next', 'HEENT:':'HEENT :',
'[skip\]':'', 'dication': 'dictation', 'wsould': 'would'
}
trans_dict = {'Filename': 'Text'}
all_files = os.listdir(folder)
for file in all_files:
if file.endswith('.rtf'):
filename = file.replace('.rtf', '.wav')
with open(file) as source:
text = rtf_to_text(source.read())
for i, j in replacement_dict.items():
text = text.replace(i, j)
trans_dict = {filename : text}
with open(folder + '/documents.csv', 'a') as f:
w = csv.writer(f)
for k, v in trans_dict.items():
w.writerow([k, v])

if __name__ == "__main__":
if len(sys.argv) !=2:
print('Converts a folder of RTF\'s into a single CSV.')
print('Usage: %s folder path' % sys.argv[0])
else:
main(sys.argv[1])