Skip to content

Commit e97107a

Browse files
committed
Merge branch 'master' of github.jpl.nasa.gov:OCO/RtRetrievalFramework
2 parents e5b061d + 4a67f30 commit e97107a

File tree

7 files changed

+398440
-30
lines changed

7 files changed

+398440
-30
lines changed

Diff for: lib/Python/acos_file.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -431,7 +431,7 @@ def get_sounding_data(self, data_name, sounding_id=None, indexes=None, return_in
431431
if dataset_obj == None or hasattr(dataset_obj, "__iter__") and len(dataset_obj) == 0:
432432
raise ValueError("No datasets matched data name: %s" % data_name)
433433
elif (hasattr(dataset_obj, "__iter__") and not hasattr(dataset_obj, "shape")) and len(dataset_obj) > 1:
434-
raise ValueError("Data name: %s matches too many datasets: %s" % (data_name, [ o.name for o in dataset_obj]))
434+
raise ValueError("Data name: %s matches too many datasets: %s" % (data_name, [ o for o in dataset_obj]))
435435
elif hasattr(dataset_obj, "__iter__") and not hasattr(dataset_obj, "shape"):
436436
dataset_obj = dataset_obj[0]
437437

Diff for: lib/Python/acos_file_test.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def test_gosat_file():
2828
gosat_obj._data_shape_name_dict = {}
2929
gosat_obj._default_shape_names = None
3030

31-
assert gosat_obj.get_data_shape('/FootprintGeometry/footprint_stokes_coefficients') == [b'Exposure', b'Band', b'Polarization', b'StokesCoefficient']
31+
assert gosat_obj.get_data_shape('/FootprintGeometry/footprint_stokes_coefficients') == ['Exposure', 'Band', 'Polarization', 'StokesCoefficient']
3232

3333
sounding_id = 20090725015225
3434
read_latitude = gosat_obj.get_sounding_info('sounding_latitude', sounding_id)

Diff for: lib/Python/l2_input.py

+50-7
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ def extract_whitespace(self, value):
5252

5353
return [value, frontspace, endspace]
5454

55-
def __init__(self, nodeType, leaf=None, children=None):
55+
def __init__(self, nodeType, leaf=None, children=None, value_list=None):
56+
self.value_list = value_list
5657
self.type = nodeType
5758
if children:
5859
self.children = children
@@ -319,7 +320,18 @@ def get_matrix_data(self):
319320
for child in self.children:
320321
if child.type == 'matrix':
321322
matrixValues = []
322-
323+
# Faster handling if we pulled the values out (e.g.
324+
# for _AsciiParser)
325+
if(child.value_list is not None):
326+
mval = child.value_list[int(child.children[0][0].leaf)]
327+
# Each line is a row, and each entry on a row is a column
328+
res = [i.split() for i in mval.split("\n")]
329+
# Now flatten this, so a 1 column matrix gets returned as a
330+
# vector
331+
res = [i[0] if len(i) == 1 else i for i in res
332+
if len(i) != 0]
333+
return res
334+
# Drop down to slower handling (e.g., for _XmlParser)
323335
for rowNode in child.children:
324336
rowValues = [ x.leaf for x in rowNode ]
325337

@@ -329,7 +341,6 @@ def get_matrix_data(self):
329341
matrixValues.append(rowValues)
330342

331343
return matrixValues
332-
333344
return None
334345

335346
def set_matrix_data(self, newData):
@@ -385,8 +396,9 @@ def del_child(self, deadChild):
385396

386397
class _AsciiParser(PlyParser):
387398

388-
def __init__(self, contents='', **kw):
399+
def __init__(self, value_list=None, contents='', **kw):
389400
'Initialize the class'
401+
self.value_list=value_list
390402
PlyParser.__init__(self, **kw)
391403

392404
##############
@@ -542,8 +554,8 @@ def p_section_contents_value_list(self, p):
542554
# matrix row
543555
if len(matrixData[rowIndex]) == 0:
544556
del(matrixData[rowIndex])
545-
546-
p[0] = [ _Node('matrix', children=matrixData) ]
557+
p[0] = [ _Node('matrix', children=matrixData,
558+
value_list=self.value_list) ]
547559

548560
def p_section_contents_empty(self, p):
549561
'section_contents : empty'
@@ -782,10 +794,41 @@ def read(self, file_input):
782794
is_xml = True
783795
break
784796

797+
# The _AsciiParser can be extremely slow for large files.
798+
# We pull out any values to handle separately. We have an array
799+
# value_list that contains the contents of the value, we replace
800+
# this in fileContentsShort with the index number for this. So
801+
# _AsciiParser never actually sees the potentially long list of
802+
# values, which allows it to run much faster. We translate the
803+
# value back to the contents when we eventually look up the value.
804+
value_list = None
805+
if(not is_xml):
806+
fileContentsShort = ""
807+
value_list = []
808+
in_value = False
809+
for t in re.split('(begin|end)\s+values', fileContents, flags=re.IGNORECASE):
810+
if(t.lower() == "begin"):
811+
if(in_value):
812+
raise RuntimeError("Confused processing %s" % self.filename)
813+
in_value = True
814+
fileContentsShort += "begin VALUES\n"
815+
fileContentsShort += "%d\n" % len(value_list)
816+
value_list.append("")
817+
elif(t.lower() == "end"):
818+
if(not in_value):
819+
raise RuntimeError("Confused processing %s" % self.filename)
820+
in_value = False
821+
fileContentsShort += "end VALUES\n"
822+
else:
823+
if(in_value):
824+
value_list[-1] = t
825+
else:
826+
fileContentsShort += t
827+
785828
if (is_xml):
786829
self.rootNode = _XmlParser().parse(fileContents)
787830
else:
788-
self.rootNode = _AsciiParser(filename=self.filename).parse(fileContents)
831+
self.rootNode = _AsciiParser(value_list=value_list,filename=self.filename).parse(fileContentsShort)
789832

790833

791834
def write(self, file_output=None, doIndent=False):

Diff for: lib/Python/populator_base.py

+21-21
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ def __init__(self, **user_settings):
6767
# to create a l2/l1 aggregate. This is a useful thing for all the ACOS
6868
# like populators, but doesn't make any sense for FTS.
6969
self.have_l1b1 = True
70+
self._l2_input_file_cache = {}
7071

7172
# This will get filled in by a map of type name and class to handle
7273
# it, e.g., populator_list["oco"] = OcoPopulator. We fill this in as each
@@ -193,6 +194,13 @@ def create_populator_from_config_type(config_type, **user_settings):
193194
else:
194195
return None
195196

197+
def _l2_input_file(self, file):
198+
'''This caches reading a config file, so we don't parse the same file
199+
multiple times.'''
200+
if(file not in self._l2_input_file_cache):
201+
self._l2_input_file_cache[file] = L2InputFile(file)
202+
return self._l2_input_file_cache[file]
203+
196204
@staticmethod
197205
def create_populator_from_config_file(config_file, **user_settings):
198206
'''Read the L2 input configuration file supplied, and based on the
@@ -347,8 +355,7 @@ def __get_list_file_values(self, listLocation, listName, sectionName=None, direc
347355
listFileObj.close()
348356
else:
349357
self.logger.debug('Loading LIST %s section as %s contents from file: %s' % (sectionName, listName, listFile))
350-
fileObj = L2InputFile(listFile)
351-
358+
fileObj = self._l2_input_file(listFile)
352359
sectNameParts = sectionName.split('->')
353360

354361
foundSects = fileObj.get_section('->'.join(sectNameParts[0:-1]) + '->LIST')
@@ -359,7 +366,6 @@ def __get_list_file_values(self, listLocation, listName, sectionName=None, direc
359366
if currListName != None and currListName == sectNameParts[-1]:
360367
fileListSect = currFileSect.get_section('LIST->VALUES')
361368
break
362-
363369
if fileListSect == None or len(fileListSect) == 0:
364370
raise IOError('Could not find section %s in file: %s' % (sectionName, listFile))
365371

@@ -377,33 +383,27 @@ def read_id_list_file(self, id_list_file, section=None):
377383

378384
if section != None:
379385
self.logger.debug('Reading id list from section %s file: %s' % (section, id_list_file))
380-
id_list_str = self.__get_list_file_values(id_list_file, str(id_list_file), section)
386+
id_list = self.__get_list_file_values(id_list_file, str(id_list_file), section)
381387
else:
382388
self.logger.debug('Reading id list from file: %s' % id_list_file)
383-
id_obj = L2InputFile(id_list_file)
384-
id_list_str = id_obj.get_matrix_data()
389+
# Quicker read for text only file
390+
id_list = open(id_list_file).read().split()
385391

386-
if id_list_str == None:
392+
if id_list == None:
387393
return []
388394

389-
id_list_long = []
390-
for curr_id_str in id_list_str:
391-
# Try to match sounding id pattern
392-
id_match = re.search('\d{3,17}\w?', curr_id_str)
393-
if not id_match:
394-
raise IOError('Could not find sounding id in string: "%s" in file %s' % (curr_id_str, id_list_file))
395-
beg_pos = int(id_match.start())
396-
end_pos = int(id_match.end())
397-
found_id = curr_id_str[beg_pos:end_pos]
398-
id_list_long.append( found_id )
399-
400-
return id_list_long
401-
395+
# Remove any whitespace
396+
id_list = [i.strip() for i in id_list]
397+
# Check for any bad data
398+
bad = [i for i in id_list if not re.match('\d{3,17}', i)]
399+
if(len(bad) > 0):
400+
raise IOError('Could not find sounding id in string: "%s" in file %s' % (bad[0], id_list_file))
401+
return id_list
402402

403403
def get_config_keyword_value(self, config_filename, keyword_path):
404404
'''Read a L2 input file as keyword/value pairs, and return the value
405405
for the given keyword'''
406-
config_obj = L2InputFile(config_filename)
406+
config_obj = self._l2_input_file(config_filename)
407407

408408
search_sect_name = '->'.join(keyword_path.split('/')[0:-1])
409409
search_key_name = keyword_path.split('/')[-1]

Diff for: lib/Python/populator_base_test.py

+22
Original file line numberDiff line numberDiff line change
@@ -90,3 +90,25 @@ def test_get_config_keyword_value():
9090
assert (val ==
9191
"/data/smyth/Level2/test/tccon_small_set/acos_L1bB2900_tccon_5_good_qual.h5")
9292

93+
94+
def test_read_id_list_file_large():
95+
'''Test reading the ID list for a large file. Historically this has
96+
been really slow, so we have a test in place here to check the speed of
97+
this.'''
98+
pb = PopulatorBase()
99+
id_list = pb.read_id_list_file(test_data + "large_sounding_ids.list")
100+
assert len(id_list) == 199158
101+
102+
def test_read_id_list_config_large():
103+
'''Test reading the ID list for a large file. Historically this has
104+
been really slow, so we have a test in place here to check the speed of
105+
this.
106+
107+
This checks a file that uses our old ASCII format, rather than the
108+
simpler list of soundings'''
109+
pb = PopulatorBase()
110+
id_list = pb.read_id_list_file(test_data + "large.config",
111+
"input/OCOFullPhysics/SoundingIds")
112+
assert len(id_list) == 199158
113+
114+

0 commit comments

Comments
 (0)