Commit 6379437

Author: Martin
Message: Various changes to utils.py, e.g. to optimize for larger log files (> 5 MB) on S3
Parent: cf64bc0

File tree: 6 files changed (44 additions & 35 deletions)

One of the changed files is binary and is not shown.

examples/data-processing/README.md
Lines changed: 2 additions & 0 deletions

@@ -22,7 +22,9 @@ Download this folder, enter it, open your command prompt and run below:
 ### Regarding local disk vs S3
 The examples load data from local disk by default. If you want to load data from your S3 server, modify `devices` to include a list of S3 device paths (e.g. `"my_bucket/device_id"`). In addition, you'll modify the `fs` initialization to include your S3 details as below:

+```
 fs = setup_fs(s3=True, key="access_key", secret="secret_key", endpoint="endpoint")
+```

 If you're using AWS S3, your endpoint would e.g. be `https://s3.us-east-2.amazonaws.com` (if your region is `us-east-2`). A MinIO S3 endpoint would e.g. be `http://192.168.0.1:9000`.
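For the MinIO-with-TLS case, the `cert` parameter of `setup_fs` (see the utils.py changes below) can point to the server certificate. A minimal sketch with placeholder credentials and a hypothetical certificate path:

```
fs = setup_fs(
    s3=True,
    key="access_key",  # placeholder credentials
    secret="secret_key",
    endpoint="https://192.168.0.1:9000",  # example MinIO endpoint with TLS
    cert="certs/public.crt",  # hypothetical path to the server certificate
)
```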

examples/data-processing/process_data.py
Lines changed: 1 addition & 1 deletion

@@ -39,6 +39,6 @@ def ratio(s1, s2):

 # --------------------------------------------
 # example: resample and restructure data (parameters in columns)
-df_phys_join = restructure_data(df_phys=df_phys_all, res="1S")
+df_phys_join = restructure_data(df_phys=df_phys_all, res="1S", full_col_names=True)
 df_phys_join.to_csv("output_joined.csv")
 print(df_phys_join)
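Since `restructure_data` now defaults `full_col_names` to `False` (see utils.py below), the example passes it explicitly to keep the prefixed column names. An illustration of the effect, using the standard J1939 EEC1 frame 0x0CF00400 as a hypothetical input (signal names are illustrative):

```
# full_col_names=True prefixes each signal with the hex CAN ID
df = restructure_data(df_phys=df_phys_all, res="1S", full_col_names=True)
print(df.columns)  # e.g. Index(['CF00400.EngineSpeed', ...])

# with the new default, columns are bare signal names
df = restructure_data(df_phys=df_phys_all, res="1S")
print(df.columns)  # e.g. Index(['EngineSpeed', ...])
```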

examples/data-processing/process_tp_data.py
Lines changed: 4 additions & 4 deletions

@@ -23,7 +23,7 @@ def process_tp_example(devices, dbc_path, tp_type):
     df_raw.to_csv(f"{output_folder}/tp_raw_data_combined.csv")

     # extract physical values as normal, but add tp_type
-    df_phys = proc.extract_phys(df_raw, tp_type=tp_type)
+    df_phys = proc.extract_phys(df_raw)
     df_phys.to_csv(f"{output_folder}/tp_physical_values.csv")

     print("Finished saving CSV output for devices:", devices)

@@ -43,9 +43,9 @@ def process_tp_example(devices, dbc_path, tp_type):
 process_tp_example(devices, dbc_paths, "j1939")

 # NMEA 2000 fast packet data (with GNSS position)
-# devices = ["LOG_TP/94C49784"]
-# dbc_paths = [r"dbc_files/tp_nmea_2.dbc"]
-# process_tp_example(devices, dbc_paths, "nmea")
+devices = ["LOG_TP/94C49784"]
+dbc_paths = [r"dbc_files/tp_nmea_2.dbc"]
+process_tp_example(devices, dbc_paths, "nmea")

 # UDS data across two CAN channels
 devices = ["LOG_TP/FE34E37D"]

examples/data-processing/requirements.txt
Lines changed: 2 additions & 1 deletion

@@ -29,4 +29,5 @@ typing-extensions==3.10.0.0
 urllib3==1.26.5
 wrapt==1.12.1
 yarl==1.6.3
-zipp==3.4.1
+zipp==3.4.1
+j1939_pgn==0.4
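The new `j1939_pgn` dependency backs the PGN-based column naming added in utils.py below. A quick sanity check, assuming `.pgn` returns the integer PGN as it is used in `restructure_data`:

```
from J1939_PGN import J1939_PGN

# the PGN occupies bits 8-25 of a 29-bit J1939 CAN ID
print(J1939_PGN(0x0CF00400).pgn)  # expect 61444 (0xF004, EEC1)
```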

examples/data-processing/utils.py
Lines changed: 35 additions & 29 deletions

@@ -1,18 +1,26 @@
 def setup_fs(s3, key="", secret="", endpoint="", cert=""):
     """Given a boolean specifying whether to use local disk or S3, setup filesystem
     Syntax examples: AWS (http://s3.us-east-2.amazonaws.com), MinIO (http://192.168.0.1:9000)
-    The cert input is relevant if you're using MinIO with TLS enabled, for specifying the path to the certficiate
+    The cert input is relevant if you're using MinIO with TLS enabled, for specifying the path to the certificate.
+
+    The block_size is set to accommodate files up to 55 MB in size. If your log files are larger, adjust this value accordingly.
     """

     if s3:
         import s3fs

+        block_size = 55 * 1024 * 1024
+
         if "amazonaws" in endpoint:
-            fs = s3fs.S3FileSystem(key=key, secret=secret)
+            fs = s3fs.S3FileSystem(key=key, secret=secret, default_block_size=block_size)
         elif cert != "":
-            fs = s3fs.S3FileSystem(key=key, secret=secret, client_kwargs={"endpoint_url": endpoint, "verify": cert})
+            fs = s3fs.S3FileSystem(
+                key=key, secret=secret, client_kwargs={"endpoint_url": endpoint, "verify": cert}, default_block_size=block_size
+            )
         else:
-            fs = s3fs.S3FileSystem(key=key, secret=secret, client_kwargs={"endpoint_url": endpoint},)
+            fs = s3fs.S3FileSystem(
+                key=key, secret=secret, client_kwargs={"endpoint_url": endpoint}, default_block_size=block_size
+            )

     else:
         from pathlib import Path
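For context: s3fs fetches objects in chunks of `default_block_size` bytes (5 MB by default), so log files above that size previously triggered several ranged GET requests. A hedged sketch of raising the limit further, with placeholder credentials and a hypothetical size:

```
import s3fs

# hypothetical value for log files up to ~100 MB; one block per file
# means each log file can be pulled in a single S3 request
block_size = 100 * 1024 * 1024

fs = s3fs.S3FileSystem(
    key="access_key",  # placeholder credentials
    secret="secret_key",
    client_kwargs={"endpoint_url": "http://192.168.0.1:9000"},  # example MinIO endpoint
    default_block_size=block_size,
)
```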
@@ -52,37 +60,31 @@ def list_log_files(fs, devices, start_times, verbose=True):
         for idx, device in enumerate(devices):
             start = start_times[idx]
             log_files_device = canedge_browser.get_log_files(fs, [device], start_date=start)
-
-            # exclude the 1st log file if the last timestamp is before the start timestamp
-            if len(log_files_device) > 0:
-                with fs.open(log_files_device[0], "rb") as handle:
-                    mdf_file = mdf_iter.MdfFile(handle)
-                    df_raw_lin = mdf_file.get_data_frame_lin()
-                    df_raw_lin["IDE"] = 0
-                    df_raw_can = mdf_file.get_data_frame()
-                    df_raw = df_raw_can.append(df_raw_lin)
-                    end_time = df_raw.index[-1]
-
-                    if end_time < start:
-                        log_files_device = log_files_device[1:]
-
-                log_files.extend(log_files_device)
+            log_files.extend(log_files_device)

     if verbose:
         print(f"Found {len(log_files)} log files\n")

     return log_files


-def restructure_data(df_phys, res, full_col_names=True):
+def restructure_data(df_phys, res, full_col_names=False, pgn_names=False):
     import pandas as pd
+    from J1939_PGN import J1939_PGN

     df_phys_join = pd.DataFrame({"TimeStamp": []})
     if not df_phys.empty:
         for message, df_phys_message in df_phys.groupby("CAN ID"):
             for signal, data in df_phys_message.groupby("Signal"):
-                if full_col_names == True:
-                    col_name = str(hex(int(message))).upper()[2:] + "." + str(signal)
+
+                pgn = J1939_PGN(int(message)).pgn
+
+                if full_col_names == True and pgn_names == False:
+                    col_name = str(hex(int(message))).upper()[2:] + "." + signal
+                elif full_col_names == True and pgn_names == True:
+                    col_name = str(hex(int(message))).upper()[2:] + "." + str(pgn) + "." + signal
+                elif full_col_names == False and pgn_names == True:
+                    col_name = str(pgn) + "." + signal
                 else:
                     col_name = signal
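A usage sketch of the new `pgn_names` flag, assuming a `df_phys_all` built as in the examples and J1939 data containing the EEC1 frame 0x0CF00400 (signal names are illustrative):

```
# pgn_names=True alone: columns keyed by the decimal PGN
df = restructure_data(df_phys=df_phys_all, res="1S", pgn_names=True)
print(df.columns)  # e.g. Index(['61444.EngineSpeed', ...])

# both flags: hex CAN ID, then PGN, then signal
df = restructure_data(df_phys=df_phys_all, res="1S", full_col_names=True, pgn_names=True)
print(df.columns)  # e.g. Index(['CF00400.61444.EngineSpeed', ...])
```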

@@ -178,18 +180,23 @@ def filter_signals(self, df_phys):

         return df_phys

-    def get_raw_data(self, log_file):
-        """Extract a df of raw data and device ID from log file
+    def get_raw_data(self, log_file, lin=False):
+        """Extract a df of raw data and device ID from log file.
+        Optionally include LIN bus data by setting lin=True
         """
         import mdf_iter

         with self.fs.open(log_file, "rb") as handle:
             mdf_file = mdf_iter.MdfFile(handle)
             device_id = self.get_device_id(mdf_file)
-            df_raw_lin = mdf_file.get_data_frame_lin()
-            df_raw_lin["IDE"] = 0
-            df_raw_can = mdf_file.get_data_frame()
-            df_raw = df_raw_can.append(df_raw_lin)
+
+            if lin:
+                df_raw_lin = mdf_file.get_data_frame_lin()
+                df_raw_lin["IDE"] = 0
+                df_raw_can = mdf_file.get_data_frame()
+                df_raw = df_raw_can.append(df_raw_lin)
+            else:
+                df_raw = mdf_file.get_data_frame()

         return df_raw, device_id
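A usage sketch, assuming `proc` is the ProcessData instance from the examples; LIN extraction is now opt-in:

```
df_raw, device_id = proc.get_raw_data(log_file)            # CAN frames only (new default)
df_raw, device_id = proc.get_raw_data(log_file, lin=True)  # CAN plus LIN frames
```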

@@ -342,7 +349,6 @@ def construct_new_tp_frame(self, base_frame, payload_concatenated, can_id):

     def combine_tp_frames(self, df_raw):
         import pandas as pd
-        import sys

         bam_pgn_hex = self.frame_struct["bam_pgn_hex"]
         res_id_list = [int(res_id, 16) for res_id in self.frame_struct["res_id_list_hex"]]
