From 1f957175b01588377b43b5843efddc82c597896d Mon Sep 17 00:00:00 2001 From: Rafael Steil Date: Mon, 17 May 2021 19:08:53 -0400 Subject: [PATCH] Add a log parser class --- src/plotman/_tests/log_parser_test.py | 54 +++++++ .../2021-04-04T19_00_47.681088-0400.log | 1 + src/plotman/log_parser.py | 140 ++++++++++++++++++ src/plotman/plotinfo.py | 108 ++++++++++++++ 4 files changed, 303 insertions(+) create mode 100644 src/plotman/_tests/log_parser_test.py create mode 100644 src/plotman/log_parser.py create mode 100644 src/plotman/plotinfo.py diff --git a/src/plotman/_tests/log_parser_test.py b/src/plotman/_tests/log_parser_test.py new file mode 100644 index 00000000..31e6c417 --- /dev/null +++ b/src/plotman/_tests/log_parser_test.py @@ -0,0 +1,54 @@ +import importlib.resources + +from plotman._tests import resources +from plotman.log_parser import PlotLogParser + +def test_should_correctly_parse(): + with importlib.resources.path(resources, "2021-04-04T19_00_47.681088-0400.log") as p: + logfile_path = p + + parser = PlotLogParser() + info = parser.parse(logfile_path) + + assert info.plot_id == "3eb8a37981de1cc76187a36ed947ab4307943cf92967a7e166841186c7899e24" + + assert info.plot_size == 32 + assert info.started_at == "Sun Apr 4 19:00:50 2021" + assert info.buffer == 4000 + assert info.buckets == 128 + assert info.threads == 4 + + assert info.tmp_dir1 == "/farm/yards/901" + assert info.tmp_dir2 == "/farm/yards/901" + + assert info.phase1_duration_raw == 17571.981 + assert info.phase1_duration == 17572 + assert info.phase1_duration_minutes == 293 + assert info.phase1_duration_hours == 4.88 + + assert info.phase2_duration_raw == 6911.621 + assert info.phase2_duration == 6912 + assert info.phase2_duration_minutes == 115 + assert info.phase2_duration_hours == 1.92 + + assert info.phase3_duration_raw == 14537.188 + assert info.phase3_duration == 14537 + assert info.phase3_duration_minutes == 242 + assert info.phase3_duration_hours == 4.04 + + assert info.phase4_duration_raw == 924.288 + assert info.phase4_duration == 924 + assert info.phase4_duration_minutes == 15 + assert info.phase4_duration_hours == 0.26 + + assert info.total_time_raw == 39945.080 + assert info.total_time == 39945 + assert info.total_time_minutes == 666 + assert info.total_time_hours == 11.10 + + assert info.copy_time_raw == 501.696 + assert info.copy_time == 502 + assert info.copy_time_minutes == 8 + assert info.copy_time_hours == 0.14 + + assert info.filename == "/farm/wagons/801/plot-k32-2021-04-04-19-00-3eb8a37981de1cc76187a36ed947ab4307943cf92967a7e166841186c7899e24.plot" \ No newline at end of file diff --git a/src/plotman/_tests/resources/2021-04-04T19_00_47.681088-0400.log b/src/plotman/_tests/resources/2021-04-04T19_00_47.681088-0400.log index 704efe51..ce11a343 100644 --- a/src/plotman/_tests/resources/2021-04-04T19_00_47.681088-0400.log +++ b/src/plotman/_tests/resources/2021-04-04T19_00_47.681088-0400.log @@ -2081,6 +2081,7 @@ Approximate working space used (without final file): 286.598 GiB Final File size: 101.336 GiB Total time = 39945.080 seconds. CPU (123.100%) Mon Apr 5 06:06:35 2021 Copied final file from "/farm/yards/901/plot-k32-2021-04-04-19-00-3eb8a37981de1cc76187a36ed947ab4307943cf92967a7e166841186c7899e24.plot.2.tmp" to "/farm/wagons/801/plot-k32-2021-04-04-19-00-3eb8a37981de1cc76187a36ed947ab4307943cf92967a7e166841186c7899e24.plot.2.tmp" +Copy time = 501.696 seconds. CPU (23.860%) Sun May 9 22:52:41 2021 Removed temp2 file "/farm/yards/901/plot-k32-2021-04-04-19-00-3eb8a37981de1cc76187a36ed947ab4307943cf92967a7e166841186c7899e24.plot.2.tmp"? 1 Renamed final file from "/farm/wagons/801/plot-k32-2021-04-04-19-00-3eb8a37981de1cc76187a36ed947ab4307943cf92967a7e166841186c7899e24.plot.2.tmp" to "/farm/wagons/801/plot-k32-2021-04-04-19-00-3eb8a37981de1cc76187a36ed947ab4307943cf92967a7e166841186c7899e24.plot" 06:22:40.715 src.plotting.create_plots : INFO  Summary: diff --git a/src/plotman/log_parser.py b/src/plotman/log_parser.py new file mode 100644 index 00000000..5dbfcfe0 --- /dev/null +++ b/src/plotman/log_parser.py @@ -0,0 +1,140 @@ +import re +from plotman.plotinfo import PlotInfo + +class PlotLogParser: + """Parser for a finished plotting job""" + + def parse(self, filename: str) -> PlotInfo: + """Parses a single log and returns its info""" + entry = PlotInfo() + + matchers = [ + self.ignore_line, + self.plot_id, + self.plot_start_date, + self.plot_size, + self.buffer_size, + self.buckets, + self.threads, + self.plot_dirs, + self.phase1_duration, + self.phase2_duration, + self.phase3_duration, + self.phase4_duration, + self.total_time, + self.copy_time, + self.filename + ] + + with open(filename, 'r') as f: + for line in f: + for matcher in matchers: + if (matcher(line, entry)): + break + + return entry + + # ID: 3eb8a37981de1cc76187a36ed947ab4307943cf92967a7e166841186c7899e24 + def plot_id(self, line: str, entry: PlotInfo) -> bool: + m = re.search(r'^ID: (.+)$', line) + if m: + entry.plot_id = m.group(1) + return m != None + + # Renamed final file from "/farm/wagons/801/abc.plot.2.tmp" to "/farm/wagons/801/abc.plot" + def filename(self, line: str, entry: PlotInfo) -> bool: + m = re.search(r'^Renamed final file from ".+" to "(.+)"', line) + if m: + entry.filename = m.group(1) + return m != None + + # Time for phase 1 = 17571.981 seconds. CPU (178.600%) Sun Apr 4 23:53:42 2021 + def phase1_duration(self, line: str, entry: PlotInfo) -> bool: + m = re.search(r"^Time for phase 1 = (\d+\.\d+) seconds", line) + if m: + entry.phase1_duration_raw = float(m.group(1)) + return m != None + + # Time for phase 2 = 6911.621 seconds. CPU (71.780%) Mon Apr 5 01:48:54 2021 + def phase2_duration(self, line: str, entry: PlotInfo) -> bool: + m = re.search(r"^Time for phase 2 = (\d+\.\d+) seconds", line) + if m: + entry.phase2_duration_raw = float(m.group(1)) + return m != None + + # Time for phase 3 = 14537.188 seconds. CPU (82.730%) Mon Apr 5 05:51:11 2021 + def phase3_duration(self, line: str, entry: PlotInfo) -> bool: + m = re.search(r"^Time for phase 3 = (\d+\.\d+) seconds", line) + if m: + entry.phase3_duration_raw = float(m.group(1)) + return m != None + + # Time for phase 4 = 924.288 seconds. CPU (86.810%) Mon Apr 5 06:06:35 2021 + def phase4_duration(self, line: str, entry: PlotInfo) -> bool: + m = re.search(r"^Time for phase 4 = (\d+\.\d+) seconds", line) + if m: + entry.phase4_duration_raw = float(m.group(1)) + return m != None + + # Total time = 39945.080 seconds. CPU (123.100%) Mon Apr 5 06:06:35 2021 + def total_time(self, line: str, entry: PlotInfo) -> bool: + m = re.search(r"^Total time = (\d+\.\d+) seconds", line) + if m: + entry.total_time_raw = float(m.group(1)) + return m != None + + # Copy time = 501.696 seconds. CPU (23.860%) Sun May 9 22:52:41 2021 + def copy_time(self, line: str, entry: PlotInfo) -> bool: + m = re.search(r"^Copy time = (\d+\.\d+) seconds", line) + if m: + entry.copy_time_raw = float(m.group(1)) + return m != None + + # Starting plotting progress into temporary dirs: /farm/yards/901 and /farm/yards/901 + def plot_dirs(self, line: str, entry: PlotInfo) -> bool: + m = re.search(r"^Starting plotting progress into temporary dirs: (.+) and (.+)$", line) + if m: + entry.tmp_dir1 = m.group(1) + entry.tmp_dir2 = m.group(2) + return m != None + + # Using 4 threads of stripe size 65536 + def threads(self, line: str, entry: PlotInfo) -> bool: + m = re.search(r"^Using (\d+) threads of stripe size (\d+)", line) + if m: + entry.threads = int(m.group(1)) + return m != None + + # "^Using (\\d+) buckets" + def buckets(self, line: str, entry: PlotInfo) -> bool: + m = re.search(r"^Using (\d+) buckets", line) + if m: + entry.buckets = int(m.group(1)) + return m != None + + # Buffer size is: 4000MiB + def buffer_size(self, line: str, entry: PlotInfo) -> bool: + m = re.search(r"^Buffer size is: (\d+)MiB", line) + if m: + entry.buffer = int(m.group(1)) + return m != None + + # Plot size is: 32 + def plot_size(self, line: str, entry: PlotInfo) -> bool: + m = re.search(r'^Plot size is: (\d+)', line) + if m: + entry.plot_size = int(m.group(1)) + return m != None + + # Starting phase 1/4: Forward Propagation into tmp files... Sun May 9 17:36:12 2021 + def plot_start_date(self, line: str, entry: PlotInfo) -> bool: + m = re.search(r'^Starting phase 1/4: Forward Propagation into tmp files\.\.\. (.+)', line) + if m: + entry.started_at = m.group(1) + return m != None + + + # Ignore lines starting with Bucket + # Bucket 0 uniform sort. Ram: 3.250GiB, u_sort min: 0.563GiB, qs min: 0.281GiB. + def ignore_line(self, line: str, _: PlotInfo) -> bool: + return re.search(r'^\tBucket', line) \ No newline at end of file diff --git a/src/plotman/plotinfo.py b/src/plotman/plotinfo.py new file mode 100644 index 00000000..8b56c02c --- /dev/null +++ b/src/plotman/plotinfo.py @@ -0,0 +1,108 @@ +import attr + +@attr.s(repr=True, init=False) +class PlotInfo: + """Represents the results of a finished plot job""" + started_at: str = "" + plot_id: str = "" + buckets: int = 0 + threads: int = 0 + buffer: int = 0 + plot_size: int = 0 + tmp_dir1: str = "" + tmp_dir2: str = "" + phase1_duration_raw: float = 0 + phase2_duration_raw: float = 0 + phase3_duration_raw: float = 0 + phase4_duration_raw: float = 0 + total_time_raw: float = 0 + copy_time_raw: float = 0 + filename: str = "" + + def is_empty(self) -> bool: + "Data is considered empty if total_time is zero" + return self.total_time == 0 + + # Phase 1 duration + @property + def phase1_duration(self) -> int: + return round(self.phase1_duration_raw) + + @property + def phase1_duration_minutes(self) -> int: + return self.duration_to_minutes(self.phase1_duration_raw) + + @property + def phase1_duration_hours(self) -> float: + return self.duration_to_hours(self.phase1_duration_raw) + + # Phase 2 duration + @property + def phase2_duration(self) -> int: + return round(self.phase2_duration_raw) + + @property + def phase2_duration_minutes(self) -> int: + return self.duration_to_minutes(self.phase2_duration_raw) + + @property + def phase2_duration_hours(self) -> float: + return self.duration_to_hours(self.phase2_duration_raw) + + # Phase 3 duration + @property + def phase3_duration(self) -> int: + return round(self.phase3_duration_raw) + + @property + def phase3_duration_minutes(self) -> int: + return self.duration_to_minutes(self.phase3_duration_raw) + + @property + def phase3_duration_hours(self) -> float: + return self.duration_to_hours(self.phase3_duration_raw) + + # Phase 4 duration + @property + def phase4_duration(self) -> int: + return round(self.phase4_duration_raw) + + @property + def phase4_duration_minutes(self) -> int: + return self.duration_to_minutes(self.phase4_duration_raw) + + @property + def phase4_duration_hours(self) -> float: + return self.duration_to_hours(self.phase4_duration_raw) + + # Total time + @property + def total_time(self) -> int: + return round(self.total_time_raw) + + @property + def total_time_minutes(self) -> int: + return self.duration_to_minutes(self.total_time_raw) + + @property + def total_time_hours(self) -> float: + return self.duration_to_hours(self.total_time_raw) + + # Copy time + @property + def copy_time(self) -> int: + return round(self.copy_time_raw) + + @property + def copy_time_minutes(self) -> int: + return self.duration_to_minutes(self.copy_time_raw) + + @property + def copy_time_hours(self) -> float: + return self.duration_to_hours(self.copy_time_raw) + + def duration_to_minutes(self, duration): + return round(duration / 60) + + def duration_to_hours(self, duration): + return round(duration / 60 / 60, 2)