Merge branch 'master' of github.jpl.nasa.gov:OCO/RtRetrievalFramework

mcduffie · mcduffie · commit e97107a58a48 · 2017-01-23T19:45:43.000Z
diff --git a/lib/Python/acos_file.py b/lib/Python/acos_file.py
@@ -431,7 +431,7 @@ def get_sounding_data(self, data_name, sounding_id=None, indexes=None, return_in
         if dataset_obj == None or hasattr(dataset_obj, "__iter__") and len(dataset_obj) == 0:
             raise ValueError("No datasets matched data name: %s" % data_name)
         elif (hasattr(dataset_obj, "__iter__") and not hasattr(dataset_obj, "shape")) and len(dataset_obj) > 1:
-            raise ValueError("Data name: %s matches too many datasets: %s" % (data_name, [ o.name for o in dataset_obj]))
+            raise ValueError("Data name: %s matches too many datasets: %s" % (data_name, [ o for o in dataset_obj]))
         elif hasattr(dataset_obj, "__iter__") and not hasattr(dataset_obj, "shape"):
             dataset_obj = dataset_obj[0]
 
diff --git a/lib/Python/acos_file_test.py b/lib/Python/acos_file_test.py
@@ -28,7 +28,7 @@ def test_gosat_file():
     gosat_obj._data_shape_name_dict = {}
     gosat_obj._default_shape_names = None
     
-    assert gosat_obj.get_data_shape('/FootprintGeometry/footprint_stokes_coefficients') == [b'Exposure', b'Band', b'Polarization', b'StokesCoefficient']
+    assert gosat_obj.get_data_shape('/FootprintGeometry/footprint_stokes_coefficients') == ['Exposure', 'Band', 'Polarization', 'StokesCoefficient']
 
     sounding_id = 20090725015225
     read_latitude = gosat_obj.get_sounding_info('sounding_latitude', sounding_id)
diff --git a/lib/Python/l2_input.py b/lib/Python/l2_input.py
@@ -52,7 +52,8 @@ def extract_whitespace(self, value):
 
         return [value, frontspace, endspace]
 
-    def __init__(self, nodeType, leaf=None, children=None):
+    def __init__(self, nodeType, leaf=None, children=None, value_list=None):
+         self.value_list = value_list
          self.type = nodeType
          if children:
               self.children = children
@@ -319,7 +320,18 @@ def get_matrix_data(self):
         for child in self.children:
             if child.type == 'matrix':
                 matrixValues = []
-
+                # Faster handling if we pulled the values out (e.g.
+                # for _AsciiParser)
+                if(child.value_list is not None):
+                    mval = child.value_list[int(child.children[0][0].leaf)]
+                    # Each line is a row, and each entry on a row is a column
+                    res = [i.split() for i in mval.split("\n")]
+                    # Now flatten this, so a 1 column matrix gets returned as a
+                    # vector
+                    res = [i[0] if len(i) == 1 else i for i in res
+                           if len(i) != 0]
+                    return res
+                # Drop down to slower handling (e.g., for _XmlParser)
                 for rowNode in child.children:
                     rowValues = [ x.leaf for x in rowNode ]
 
@@ -329,7 +341,6 @@ def get_matrix_data(self):
                         matrixValues.append(rowValues)
 
                 return matrixValues
-            
         return None
 
     def set_matrix_data(self, newData):
@@ -385,8 +396,9 @@ def del_child(self, deadChild):
 
 class _AsciiParser(PlyParser):
 
-    def __init__(self, contents='', **kw):
+    def __init__(self, value_list=None, contents='', **kw):
         'Initialize the class'
+        self.value_list=value_list
         PlyParser.__init__(self, **kw)
 
     ##############
@@ -542,8 +554,8 @@ def p_section_contents_value_list(self, p):
         # matrix row
         if len(matrixData[rowIndex]) == 0:
             del(matrixData[rowIndex])
-
-        p[0] = [ _Node('matrix', children=matrixData) ]
+        p[0] = [ _Node('matrix', children=matrixData,
+                       value_list=self.value_list) ]
 
     def p_section_contents_empty(self, p):
         'section_contents : empty'
@@ -782,10 +794,41 @@ def read(self, file_input):
                         is_xml = True
                     break
 
+            # The _AsciiParser can be extremely slow for large files.
+            # We pull out any values to handle separately. We have an array
+            # value_list that holds the contents of each value block, and we
+            # replace each block in fileContentsShort with its index number. So
+            # _AsciiParser never actually sees the potentially long list of
+            # values, which allows it to run much faster. We translate the
+            # value back to the contents when we eventually look up the value.
+            value_list = None
+            if(not is_xml):
+                fileContentsShort = ""
+                value_list = []
+                in_value = False
+                for t in re.split('(begin|end)\s+values', fileContents, flags=re.IGNORECASE):
+                    if(t.lower() == "begin"):
+                        if(in_value):
+                            raise RuntimeError("Confused processing %s" % self.filename)
+                        in_value = True
+                        fileContentsShort += "begin VALUES\n"
+                        fileContentsShort += "%d\n" % len(value_list)
+                        value_list.append("")
+                    elif(t.lower() == "end"):
+                        if(not in_value):
+                            raise RuntimeError("Confused processing %s" % self.filename)
+                        in_value = False
+                        fileContentsShort += "end VALUES\n"
+                    else:
+                        if(in_value):
+                            value_list[-1] = t
+                        else:
+                            fileContentsShort += t
+
             if (is_xml):
                 self.rootNode = _XmlParser().parse(fileContents)
             else:
-                self.rootNode = _AsciiParser(filename=self.filename).parse(fileContents)
+                self.rootNode = _AsciiParser(value_list=value_list,filename=self.filename).parse(fileContentsShort)
 
 
     def write(self, file_output=None, doIndent=False):
diff --git a/lib/Python/populator_base.py b/lib/Python/populator_base.py
@@ -67,6 +67,7 @@ def __init__(self, **user_settings):
         # to create a l2/l1 aggregate. This is a useful thing for all the ACOS
         # like populators, but doesn't make any sense for FTS.
         self.have_l1b1 = True
+        self._l2_input_file_cache = {}
 
     # This will get filled in by a map of type name and class to handle
     # it, e.g., populator_list["oco"] = OcoPopulator. We fill this in as each
@@ -193,6 +194,13 @@ def create_populator_from_config_type(config_type, **user_settings):
         else:
             return None
 
+    def _l2_input_file(self, file):
+        '''This caches reading a config file, so we don't parse the same file 
+        multiple times.'''
+        if(file not in self._l2_input_file_cache):
+            self._l2_input_file_cache[file] = L2InputFile(file)
+        return self._l2_input_file_cache[file]
+    
     @staticmethod
     def create_populator_from_config_file(config_file, **user_settings):
         '''Read the L2 input configuration file supplied, and based on the
@@ -347,8 +355,7 @@ def __get_list_file_values(self, listLocation, listName, sectionName=None, direc
                     listFileObj.close()
             else:
                 self.logger.debug('Loading LIST %s section as %s contents from file: %s' % (sectionName, listName, listFile))
-                fileObj = L2InputFile(listFile)
-
+                fileObj = self._l2_input_file(listFile)
                 sectNameParts = sectionName.split('->')
 
                 foundSects = fileObj.get_section('->'.join(sectNameParts[0:-1]) + '->LIST')
@@ -359,7 +366,6 @@ def __get_list_file_values(self, listLocation, listName, sectionName=None, direc
                     if currListName != None and currListName == sectNameParts[-1]:
                         fileListSect = currFileSect.get_section('LIST->VALUES')
                         break
-
                 if fileListSect == None or len(fileListSect) == 0:
                     raise IOError('Could not find section %s in file: %s' % (sectionName, listFile))
 
@@ -377,33 +383,27 @@ def read_id_list_file(self, id_list_file, section=None):
 
         if section != None:
             self.logger.debug('Reading id list from section %s file: %s' % (section, id_list_file))
-            id_list_str = self.__get_list_file_values(id_list_file, str(id_list_file), section)
+            id_list = self.__get_list_file_values(id_list_file, str(id_list_file), section)
         else:
             self.logger.debug('Reading id list from file: %s' % id_list_file)
-            id_obj = L2InputFile(id_list_file)                                   
-            id_list_str = id_obj.get_matrix_data()
+            # Quicker read for text only file
+            id_list = open(id_list_file).read().split()
 
-        if id_list_str == None:
+        if id_list == None:
             return []
 
-        id_list_long = []
-        for curr_id_str in id_list_str:
-            # Try to match sounding id pattern
-            id_match = re.search('\d{3,17}\w?', curr_id_str)
-            if not id_match:
-                raise IOError('Could not find sounding id in string: "%s" in file %s' % (curr_id_str, id_list_file))
-            beg_pos = int(id_match.start())
-            end_pos = int(id_match.end())
-            found_id = curr_id_str[beg_pos:end_pos]
-            id_list_long.append( found_id )
-
-        return id_list_long
-
+        # Remove any white space
+        id_list = [i.strip() for i in id_list]
+        # Check for any bad data
+        bad = [i for i in id_list if not re.match('\d{3,17}', i)]
+        if(len(bad) > 0):
+            raise IOError('Could not find sounding id in string: "%s" in file %s' % (bad[0], id_list_file))
+        return id_list
 
     def get_config_keyword_value(self, config_filename, keyword_path):
         '''Read a L2 input file as keyword/value pairs, and return the value
         for the given keyword'''
-        config_obj = L2InputFile(config_filename)
+        config_obj = self._l2_input_file(config_filename)
 
         search_sect_name = '->'.join(keyword_path.split('/')[0:-1])
         search_key_name  = keyword_path.split('/')[-1]
diff --git a/lib/Python/populator_base_test.py b/lib/Python/populator_base_test.py
@@ -90,3 +90,25 @@ def test_get_config_keyword_value():
     assert (val ==
  "/data/smyth/Level2/test/tccon_small_set/acos_L1bB2900_tccon_5_good_qual.h5")
 
+
+def test_read_id_list_file_large():
+    '''Test reading the ID list for a large file. Historically this has
+    been really slow, so we have a test in place here to check the speed of 
+    this.'''
+    pb = PopulatorBase()
+    id_list = pb.read_id_list_file(test_data + "large_sounding_ids.list")
+    assert len(id_list) == 199158
+    
+def test_read_id_list_config_large():
+    '''Test reading the ID list for a large file. Historically this has
+    been really slow, so we have a test in place here to check the speed of 
+    this. 
+
+    This checks a file that uses our old ASCII format, rather than the 
+    simpler list of soundings'''
+    pb = PopulatorBase()
+    id_list = pb.read_id_list_file(test_data + "large.config",
+                                   "input/OCOFullPhysics/SoundingIds")
+    assert len(id_list) == 199158
+    
+
diff --git a/unit_test_data/large.config b/unit_test_data/large.config
diff --git a/unit_test_data/large_sounding_ids.list b/unit_test_data/large_sounding_ids.list