Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Analysis: added matplotlib figure generation with several desirable performance metrics. #612

Open
wants to merge 31 commits into
base: development
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
adc02c0
added figure file
nikwl May 21, 2021
49f2a17
integrated figfile contents into analyzer.py
nikwl May 21, 2021
4914ee2
passed logfile where I should have passed logdir
nikwl May 21, 2021
c0a75b5
default logdir and logfile should now be none
nikwl May 21, 2021
3eda2a1
the figfile condition to run new code was flipped
nikwl May 21, 2021
ba577f8
removed period for consistency
nikwl May 21, 2021
56d5363
forgot to add code that converts directory to list of files
nikwl May 21, 2021
bf7f02c
pyplot imported incorrectly
nikwl May 21, 2021
8d06393
fig file now passable for either log file or log dir
nikwl May 21, 2021
8458013
assertion prevents generating figure with too few datapoints
nikwl May 21, 2021
876724f
directory handling was passed figfile instead of logfilenames
nikwl May 21, 2021
866dbd3
fixed bug with cumulative plot
nikwl May 21, 2021
f6c958f
Revert "fix: avoid more missing process errors"
nikwl Jun 22, 2021
b106fea
baby's first merge
nikwl Jun 22, 2021
375a90e
updating fork
nikwl Jun 22, 2021
e909d1b
migrated graph
nikwl Jun 22, 2021
b89bc40
Merge branch 'ericaltendorf:development' into development
nikwl Aug 7, 2021
95e1c63
several fixes, added some cli arguments, should work now
nikwl Aug 7, 2021
be3871c
Merge branch 'development' into development
altendky Aug 28, 2021
f87c2ab
Merge branch 'development' into development
altendky Aug 28, 2021
46260ad
Fixed several discontinuities that I think were caused by the previou…
nikwl Aug 29, 2021
a6c65ed
logdir is no longer required, instead it pull from the logdir defined…
nikwl Aug 29, 2021
134d4b7
Added type annotations to functions
nikwl Aug 29, 2021
b478a14
Merge branch 'development' into nikwl/development
altendky Aug 29, 2021
a8039dd
black
altendky Aug 29, 2021
b55fb57
tidy
altendky Aug 29, 2021
2b356f5
Merge branch 'development' into nikwl_development
altendky Aug 29, 2021
37d6dc6
Merge pull request #1 from altendky/nikwl_development
nikwl Aug 29, 2021
1011b0f
Updated graph.py parser to new style. Reformatted graph.py with black.
nikwl Aug 29, 2021
3bc6d90
Update setup.cfg
altendky Aug 30, 2021
d15ec4c
[mypy-matplotlib] ignore_missing_imports = true
altendky Aug 30, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ install_requires =
psutil ~= 5.8
pyyaml ~= 5.4
texttable ~= 1.6
matplotlib ~= 3.4.2
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should probably be an extras section down below for the graphing dependencies.

graph =
    matplotlib ~= 3.4
    numpy ~= 1.20


[options.packages.find]
where=src
Expand Down
162 changes: 161 additions & 1 deletion src/plotman/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,132 @@
import re
import statistics
import sys
import time, datetime

import texttable as tt
import numpy as np

import matplotlib
import matplotlib.pyplot as plt

from plotman import plot_util


def analyze(logfilenames, clipterminals, bytmp, bybitfield):
def create_ax_dumbbell(ax, data, max_stacked=50):
    '''
    Create a dumbbell plot of concurrent plot instances over time.
    Parameters:
        ax: a matplotlib axis.
        data: numpy array with [start times, end times].
        max_stacked: maximum number of rows stacked vertically before the
            y positions wrap back to the bottom of the axis.
    '''

    def newline(p1, p2, color='r'):
        # Draw the segment connecting a plot's start point to its end point.
        l = matplotlib.lines.Line2D([p1[0],p2[0]], [p1[1],p2[1]], color=color)
        ax.add_line(l)
        return l

    # Assign each row a y position in 0..max_stacked-1, wrapping around to
    # prevent the stack from growing too tall. Truncate to exactly num_rows
    # entries: the previous negative-slice form
    # stacker[:-(max_stacked - num_rows % max_stacked)] emptied the array
    # whenever num_rows was an exact multiple of max_stacked.
    num_rows = data.shape[0]
    stacker = []
    for _ in range(int(np.ceil(num_rows / float(max_stacked)))):
        stacker.extend(range(max_stacked))
    stacker = np.array(stacker[:num_rows])

    for (p1, p2), i in zip(data[:,:2], stacker):
        newline([p1, i], [p2, i])
    ax.scatter(data[:,0], stacker, color='b')
    ax.scatter(data[:,1], stacker, color='b')

    ax.set_ylabel('Plots')
    ax.set_xlim(np.min(data[:,0])-2, np.max(data[:,1])+2)


def create_ax_plotrate(ax, data, end=True, window=3):
    '''
    Create a plot showing the rate of plotting over time. Can be computed
    with respect to the plot start (this is rate of plot creation) or
    with respect to the plot end (this is rate of plot completion).
    Parameters:
        ax: a matplotlib axis.
        data: numpy array with [start times, end times].
        end: T/F, compute plot creation or plot completion rate.
        window: Window (number of consecutive plots) to compute rate over.
    '''

    def estimate_rate(data, window):
        # Sliding-window rate: (plots spanned - 1) / elapsed time, so the
        # result is in plots per unit of the input time axis.
        rate_list = []
        window_list = []
        # This takes care of when we dont have a full window: the first
        # `window` points use a growing window anchored at data[0].
        for i in range(window):
            rate_list.append(data[i] - data[0])
            window_list.append(i)
        # This takes care of when we do
        for i in range(len(data) - window):
            rate_list.append(data[i+window] - data[i])
            window_list.append(window)
        rate_list, window_list = np.array(rate_list), np.array(window_list)
        rate_list[rate_list == 0] = np.nan # This prevents div by zero error
        # Entries marked nan above (zero elapsed time) are reported as rate 0.
        return np.where(np.logical_not(np.isnan(rate_list)), (window_list-1) / rate_list, 0)

    # Estimate the rate of ending or the rate of starting, and plot it
    # against the corresponding timestamp column.
    if end:
        rate = estimate_rate(data[:,1], window)
        ax.plot(data[:,1], rate)
    else:
        rate = estimate_rate(data[:,0], window)
        ax.plot(data[:,0], rate)

    ax.set_ylabel('Avg Plot Rate (plots/hour)')
    ax.set_xlim(np.min(data[:,0])-2, np.max(data[:,1])+2)


def create_ax_plottime(ax, data, window=3):
    '''
    Create a plot showing the average time to create a single plot. This is
    computed using a moving average. Note that the plot may not be
    very accurate for the beginning and ending windows.
    Parameters:
        ax: a matplotlib axis.
        data: numpy array with [start times, end times].
        window: Window to compute rate over.
    '''

    # Per-plot durations, padded at the tail with window-1 copies of the
    # final duration so the valid-mode convolution yields one value per plot.
    durations = data[:, 1] - data[:, 0]
    padded = np.concatenate((durations, np.full(window - 1, durations[-1])))

    # Moving average via convolution with a uniform kernel.
    kernel = np.ones(window) / window
    rolling_avg = np.convolve(padded, kernel, mode='valid')

    ax.plot(data[:, 1], rolling_avg)

    ax.set_ylabel('Avg Plot Time (hours)')
    ax.set_xlim(np.min(data[:, 0]) - 2, np.max(data[:, 1]) + 2)


def create_ax_plotcumulative(ax, data):
    '''
    Create a plot showing the cumulative number of plots over time.
    Parameters:
        ax: a matplotlib axis.
        data: numpy array with [start times, end times].
    '''
    starts = data[:, 0]
    ends = data[:, 1]

    # Each finished plot increments the running total by one.
    ax.plot(ends, np.arange(ends.shape[0]))

    ax.set_ylabel('Total plots (plots)')
    ax.set_xlim(starts.min() - 2, ends.max() + 2)


def analyze(logfilenames, clipterminals, bytmp, bybitfield, figfile):
data = {}

# Get valid logfiles if we were passed a directory
if not isinstance(logfilenames, list) and os.path.isdir(logfilenames):
logfilenames = [os.path.join(os.path.dirname(logfilenames), l) for l in os.listdir(logfilenames) if
os.path.splitext(l)[-1] == '.log']

for logfilename in logfilenames:
with open(logfilename, 'r') as f:
# Record of slicing and data associated with the slice
Expand Down Expand Up @@ -94,6 +212,48 @@ def analyze(logfilenames, clipterminals, bytmp, bybitfield):
data.setdefault(sl, {}).setdefault('phase ' + phase, []).append(phase_time[phase])
data.setdefault(sl, {}).setdefault('%usort', []).append(100 * n_uniform // n_sorts)

# Grab the time ended, compute the time started
time_ended = time.mktime(datetime.datetime.strptime(line.split(')')[-1][1:-1], '%a %b %d %H:%M:%S %Y').timetuple())
data.setdefault(sl, {}).setdefault('time ended', []).append(time_ended)
data.setdefault(sl, {}).setdefault('time started', []).append(time_ended - float(m.group(1)))

if figfile is not None:
# Prepare report
for sl in data.keys():

# This array will hold start and end data (in hours)
data_started_ended = np.array([[ts, te, te-ts] for
ts, te in zip(data[sl]['time started'], data[sl]['time ended'])
]) / (60 * 60)
assert data_started_ended.shape[0] >= 3, 'Cannot generate figure with less than 3 datapoints ({} datapoints passed)'.format(data_started_ended.shape[0])

# Sift the data so that it starts at zero
data_started_ended -= np.min(data_started_ended[:, 0])

# Sort the rows by start time
data_started_ended = data_started_ended[np.argsort(data_started_ended[:, 0])]

# Create figure
num_plots = 4
f, _ = plt.subplots(2,1, figsize=(8, 12))
ax = plt.subplot(num_plots,1,1)
ax.set_title('Plot performance summary')

create_ax_dumbbell(ax, data_started_ended)

ax = plt.subplot(num_plots,1,2)
create_ax_plotrate(ax, data_started_ended, end=True, window=3)

ax = plt.subplot(num_plots,1,3)
create_ax_plottime(ax, data_started_ended, window=3)

ax = plt.subplot(num_plots,1,4)
create_ax_plotcumulative(ax, data_started_ended)

print('Saving analysis figure to {}'.format(figfile))
ax.set_xlabel('Time (hours)')
f.savefig(figfile)

# Prepare report
tab = tt.Texttable()
all_measures = ['%usort', 'phase 1', 'phase 2', 'phase 3', 'phase 4', 'total time']
Expand Down
17 changes: 13 additions & 4 deletions src/plotman/plotman.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,12 @@ def parse_args(self):
p_analyze.add_argument('--bybitfield',
action='store_true',
help='slice by bitfield/non-bitfield sorting')
p_analyze.add_argument('logfile', type=str, nargs='+',
p_analyze.add_argument('--logfile', type=str, nargs='+', default=None,
help='logfile(s) to analyze')
p_analyze.add_argument('--logdir', type=str, default=None,
help='directory containing multiple logfiles to analyze')
altendky marked this conversation as resolved.
Show resolved Hide resolved
p_analyze.add_argument('--figfile', type=str, default=None,
help='figure to be created if logdir is passed')
nikwl marked this conversation as resolved.
Show resolved Hide resolved

args = parser.parse_args()
return args
Expand Down Expand Up @@ -155,9 +159,14 @@ def main():
# Analysis of completed jobs
#
elif args.cmd == 'analyze':

analyzer.analyze(args.logfile, args.clipterminals,
args.bytmp, args.bybitfield)
if args.logfile is not None:
analyzer.analyze(args.logfile, args.clipterminals,
args.bytmp, args.bybitfield, args.figfile)
elif args.logdir is not None:
analyzer.analyze(args.logdir, args.clipterminals,
args.bytmp, args.bybitfield, args.figfile)
else:
raise RuntimeError('Must pass a log file (--logfile) or a directory containing multiple log files (--logdir)')

else:
jobs = Job.get_running_jobs(cfg.directories.log)
Expand Down