Skip to content

Commit

Permalink
Add a log parser class
Browse files Browse the repository at this point in the history
  • Loading branch information
Rafael Steil authored and RafSteilShopify committed May 18, 2021
1 parent 5bff3dd commit 1f95717
Show file tree
Hide file tree
Showing 4 changed files with 303 additions and 0 deletions.
54 changes: 54 additions & 0 deletions src/plotman/_tests/log_parser_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import importlib.resources

from plotman._tests import resources
from plotman.log_parser import PlotLogParser

def test_should_correctly_parse():
with importlib.resources.path(resources, "2021-04-04T19_00_47.681088-0400.log") as p:
logfile_path = p

parser = PlotLogParser()
info = parser.parse(logfile_path)

assert info.plot_id == "3eb8a37981de1cc76187a36ed947ab4307943cf92967a7e166841186c7899e24"

assert info.plot_size == 32
assert info.started_at == "Sun Apr 4 19:00:50 2021"
assert info.buffer == 4000
assert info.buckets == 128
assert info.threads == 4

assert info.tmp_dir1 == "/farm/yards/901"
assert info.tmp_dir2 == "/farm/yards/901"

assert info.phase1_duration_raw == 17571.981
assert info.phase1_duration == 17572
assert info.phase1_duration_minutes == 293
assert info.phase1_duration_hours == 4.88

assert info.phase2_duration_raw == 6911.621
assert info.phase2_duration == 6912
assert info.phase2_duration_minutes == 115
assert info.phase2_duration_hours == 1.92

assert info.phase3_duration_raw == 14537.188
assert info.phase3_duration == 14537
assert info.phase3_duration_minutes == 242
assert info.phase3_duration_hours == 4.04

assert info.phase4_duration_raw == 924.288
assert info.phase4_duration == 924
assert info.phase4_duration_minutes == 15
assert info.phase4_duration_hours == 0.26

assert info.total_time_raw == 39945.080
assert info.total_time == 39945
assert info.total_time_minutes == 666
assert info.total_time_hours == 11.10

assert info.copy_time_raw == 501.696
assert info.copy_time == 502
assert info.copy_time_minutes == 8
assert info.copy_time_hours == 0.14

assert info.filename == "/farm/wagons/801/plot-k32-2021-04-04-19-00-3eb8a37981de1cc76187a36ed947ab4307943cf92967a7e166841186c7899e24.plot"
Original file line number Diff line number Diff line change
Expand Up @@ -2081,6 +2081,7 @@ Approximate working space used (without final file): 286.598 GiB
Final File size: 101.336 GiB
Total time = 39945.080 seconds. CPU (123.100%) Mon Apr 5 06:06:35 2021
Copied final file from "/farm/yards/901/plot-k32-2021-04-04-19-00-3eb8a37981de1cc76187a36ed947ab4307943cf92967a7e166841186c7899e24.plot.2.tmp" to "/farm/wagons/801/plot-k32-2021-04-04-19-00-3eb8a37981de1cc76187a36ed947ab4307943cf92967a7e166841186c7899e24.plot.2.tmp"
Copy time = 501.696 seconds. CPU (23.860%) Sun May 9 22:52:41 2021
Removed temp2 file "/farm/yards/901/plot-k32-2021-04-04-19-00-3eb8a37981de1cc76187a36ed947ab4307943cf92967a7e166841186c7899e24.plot.2.tmp"? 1
Renamed final file from "/farm/wagons/801/plot-k32-2021-04-04-19-00-3eb8a37981de1cc76187a36ed947ab4307943cf92967a7e166841186c7899e24.plot.2.tmp" to "/farm/wagons/801/plot-k32-2021-04-04-19-00-3eb8a37981de1cc76187a36ed947ab4307943cf92967a7e166841186c7899e24.plot"
06:22:40.715 src.plotting.create_plots : INFO  Summary:
Expand Down
140 changes: 140 additions & 0 deletions src/plotman/log_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import re
from plotman.plotinfo import PlotInfo

class PlotLogParser:
"""Parser for a finished plotting job"""

def parse(self, filename: str) -> PlotInfo:
"""Parses a single log and returns its info"""
entry = PlotInfo()

matchers = [
self.ignore_line,
self.plot_id,
self.plot_start_date,
self.plot_size,
self.buffer_size,
self.buckets,
self.threads,
self.plot_dirs,
self.phase1_duration,
self.phase2_duration,
self.phase3_duration,
self.phase4_duration,
self.total_time,
self.copy_time,
self.filename
]

with open(filename, 'r') as f:
for line in f:
for matcher in matchers:
if (matcher(line, entry)):
break

return entry

# ID: 3eb8a37981de1cc76187a36ed947ab4307943cf92967a7e166841186c7899e24
def plot_id(self, line: str, entry: PlotInfo) -> bool:
m = re.search(r'^ID: (.+)$', line)
if m:
entry.plot_id = m.group(1)
return m != None

# Renamed final file from "/farm/wagons/801/abc.plot.2.tmp" to "/farm/wagons/801/abc.plot"
def filename(self, line: str, entry: PlotInfo) -> bool:
m = re.search(r'^Renamed final file from ".+" to "(.+)"', line)
if m:
entry.filename = m.group(1)
return m != None

# Time for phase 1 = 17571.981 seconds. CPU (178.600%) Sun Apr 4 23:53:42 2021
def phase1_duration(self, line: str, entry: PlotInfo) -> bool:
m = re.search(r"^Time for phase 1 = (\d+\.\d+) seconds", line)
if m:
entry.phase1_duration_raw = float(m.group(1))
return m != None

# Time for phase 2 = 6911.621 seconds. CPU (71.780%) Mon Apr 5 01:48:54 2021
def phase2_duration(self, line: str, entry: PlotInfo) -> bool:
m = re.search(r"^Time for phase 2 = (\d+\.\d+) seconds", line)
if m:
entry.phase2_duration_raw = float(m.group(1))
return m != None

# Time for phase 3 = 14537.188 seconds. CPU (82.730%) Mon Apr 5 05:51:11 2021
def phase3_duration(self, line: str, entry: PlotInfo) -> bool:
m = re.search(r"^Time for phase 3 = (\d+\.\d+) seconds", line)
if m:
entry.phase3_duration_raw = float(m.group(1))
return m != None

# Time for phase 4 = 924.288 seconds. CPU (86.810%) Mon Apr 5 06:06:35 2021
def phase4_duration(self, line: str, entry: PlotInfo) -> bool:
m = re.search(r"^Time for phase 4 = (\d+\.\d+) seconds", line)
if m:
entry.phase4_duration_raw = float(m.group(1))
return m != None

# Total time = 39945.080 seconds. CPU (123.100%) Mon Apr 5 06:06:35 2021
def total_time(self, line: str, entry: PlotInfo) -> bool:
m = re.search(r"^Total time = (\d+\.\d+) seconds", line)
if m:
entry.total_time_raw = float(m.group(1))
return m != None

# Copy time = 501.696 seconds. CPU (23.860%) Sun May 9 22:52:41 2021
def copy_time(self, line: str, entry: PlotInfo) -> bool:
m = re.search(r"^Copy time = (\d+\.\d+) seconds", line)
if m:
entry.copy_time_raw = float(m.group(1))
return m != None

# Starting plotting progress into temporary dirs: /farm/yards/901 and /farm/yards/901
def plot_dirs(self, line: str, entry: PlotInfo) -> bool:
m = re.search(r"^Starting plotting progress into temporary dirs: (.+) and (.+)$", line)
if m:
entry.tmp_dir1 = m.group(1)
entry.tmp_dir2 = m.group(2)
return m != None

# Using 4 threads of stripe size 65536
def threads(self, line: str, entry: PlotInfo) -> bool:
m = re.search(r"^Using (\d+) threads of stripe size (\d+)", line)
if m:
entry.threads = int(m.group(1))
return m != None

# "^Using (\\d+) buckets"
def buckets(self, line: str, entry: PlotInfo) -> bool:
m = re.search(r"^Using (\d+) buckets", line)
if m:
entry.buckets = int(m.group(1))
return m != None

# Buffer size is: 4000MiB
def buffer_size(self, line: str, entry: PlotInfo) -> bool:
m = re.search(r"^Buffer size is: (\d+)MiB", line)
if m:
entry.buffer = int(m.group(1))
return m != None

# Plot size is: 32
def plot_size(self, line: str, entry: PlotInfo) -> bool:
m = re.search(r'^Plot size is: (\d+)', line)
if m:
entry.plot_size = int(m.group(1))
return m != None

# Starting phase 1/4: Forward Propagation into tmp files... Sun May 9 17:36:12 2021
def plot_start_date(self, line: str, entry: PlotInfo) -> bool:
m = re.search(r'^Starting phase 1/4: Forward Propagation into tmp files\.\.\. (.+)', line)
if m:
entry.started_at = m.group(1)
return m != None


# Ignore lines starting with Bucket
# Bucket 0 uniform sort. Ram: 3.250GiB, u_sort min: 0.563GiB, qs min: 0.281GiB.
def ignore_line(self, line: str, _: PlotInfo) -> bool:
return re.search(r'^\tBucket', line)
108 changes: 108 additions & 0 deletions src/plotman/plotinfo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import attr

@attr.s(repr=True, init=False)
class PlotInfo:
"""Represents the results of a finished plot job"""
started_at: str = ""
plot_id: str = ""
buckets: int = 0
threads: int = 0
buffer: int = 0
plot_size: int = 0
tmp_dir1: str = ""
tmp_dir2: str = ""
phase1_duration_raw: float = 0
phase2_duration_raw: float = 0
phase3_duration_raw: float = 0
phase4_duration_raw: float = 0
total_time_raw: float = 0
copy_time_raw: float = 0
filename: str = ""

def is_empty(self) -> bool:
"Data is considered empty if total_time is zero"
return self.total_time == 0

# Phase 1 duration
@property
def phase1_duration(self) -> int:
return round(self.phase1_duration_raw)

@property
def phase1_duration_minutes(self) -> int:
return self.duration_to_minutes(self.phase1_duration_raw)

@property
def phase1_duration_hours(self) -> float:
return self.duration_to_hours(self.phase1_duration_raw)

# Phase 2 duration
@property
def phase2_duration(self) -> int:
return round(self.phase2_duration_raw)

@property
def phase2_duration_minutes(self) -> int:
return self.duration_to_minutes(self.phase2_duration_raw)

@property
def phase2_duration_hours(self) -> float:
return self.duration_to_hours(self.phase2_duration_raw)

# Phase 3 duration
@property
def phase3_duration(self) -> int:
return round(self.phase3_duration_raw)

@property
def phase3_duration_minutes(self) -> int:
return self.duration_to_minutes(self.phase3_duration_raw)

@property
def phase3_duration_hours(self) -> float:
return self.duration_to_hours(self.phase3_duration_raw)

# Phase 4 duration
@property
def phase4_duration(self) -> int:
return round(self.phase4_duration_raw)

@property
def phase4_duration_minutes(self) -> int:
return self.duration_to_minutes(self.phase4_duration_raw)

@property
def phase4_duration_hours(self) -> float:
return self.duration_to_hours(self.phase4_duration_raw)

# Total time
@property
def total_time(self) -> int:
return round(self.total_time_raw)

@property
def total_time_minutes(self) -> int:
return self.duration_to_minutes(self.total_time_raw)

@property
def total_time_hours(self) -> float:
return self.duration_to_hours(self.total_time_raw)

# Copy time
@property
def copy_time(self) -> int:
return round(self.copy_time_raw)

@property
def copy_time_minutes(self) -> int:
return self.duration_to_minutes(self.copy_time_raw)

@property
def copy_time_hours(self) -> float:
return self.duration_to_hours(self.copy_time_raw)

def duration_to_minutes(self, duration):
return round(duration / 60)

def duration_to_hours(self, duration):
return round(duration / 60 / 60, 2)

0 comments on commit 1f95717

Please sign in to comment.