Skip to content

Commit e8c34c6

Browse files
author
Jonah Paten
authored
feat: 4305 added gspread analytics package (#4314)
* feat: added spreadsheet functions to ga package (#4305) * chore: refactored ga metric and dimension names to constants (#4305) * chore: finished refactor (#4305) * chore: bumped analytics package version (#4310) * chore: added new dependencies to setup.py (#4310) * fix: removed redundant api authentication (#4305)
1 parent b37e1ba commit e8c34c6

File tree

6 files changed

+378
-2
lines changed

6 files changed

+378
-2
lines changed

analytics/analytics_package/analytics/api.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,12 @@
2424
lambda service, params: service.reports().query(**params).execute()
2525
)
2626

27+
# Service parameters for the Google Drive v3 API: the OAuth scopes to request
# (Drive and Sheets), the API name, the API version, and an empty dict in the
# fourth slot (its role is defined by whatever consumes these tuples).
drive_service_params = (
    [
        "https://www.googleapis.com/auth/drive",
        "https://www.googleapis.com/auth/spreadsheets",
    ],
    "drive",
    "v3",
    {},
)
32+
2733
next_port = None
2834
default_service_system = None
2935

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Metric names
METRIC_EVENT_COUNT = 'eventCount'
METRIC_TOTAL_USERS = 'totalUsers'

# Dimensions
# Each dimension is a dict with three keys:
#   'id'             - the dimension identifier sent to the Analytics API
#   'alias'          - the column name the dimension is renamed to
#   'remove_matches' - a regex for values to drop, or None to keep everything
DIMENSION_PAGE_PATH = dict(
    id='pagePath',
    alias='page_path',
    remove_matches=None,
)
DIMENSION_BUILTIN_URL = dict(
    id='linkUrl',
    alias='builtin_url',
    remove_matches=r"\s*",
)
DIMENSION_EVENT_NAME = dict(
    id='eventName',
    alias='event_name',
    remove_matches=None,
)
DIMENSION_CUSTOM_URL = dict(
    id='customEvent:click_url',
    alias='outbound_url',
    remove_matches=r"\(not set\)",
)
Lines changed: 245 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
import gspread
2+
import gspread_formatting
3+
from enum import Enum
4+
from googleapiclient.discovery import build
5+
6+
class FILE_OVERRIDE_BEHAVIORS(Enum):
    """What create_sheet_in_folder does when a spreadsheet with the requested name already exists."""
    # Delete same-named spreadsheets found in the target folder, then create the new one
    OVERRIDE_IF_IN_SAME_PLACE = 1
    # Raise if a same-named spreadsheet is found in the target folder
    EXIT_IF_IN_SAME_PLACE = 2
    # Raise if a same-named spreadsheet is found at all, regardless of folder
    EXIT_ANYWHERE = 3
10+
11+
class WORKSHEET_OVERRIDE_BEHAVIORS(Enum):
    """What the worksheet-filling functions do when a worksheet with the requested name already exists."""
    # Reuse the existing worksheet and write the new data into it
    OVERRIDE = 1
    # Raise instead of touching the existing worksheet
    EXIT = 2
14+
15+
# Font size, in points, assumed when estimating how wide a column must be
FONT_SIZE_PTS = 10
# Divisor applied to the point-based width estimate before it is passed to
# set_column_widths (presumably a points/pixels conversion -- TODO confirm)
PTS_PIXELS_RATIO = 4/3
# Default number of padding characters added to each column's widest entry
DEFAULT_BUFFER_CHARS = 2
18+
19+
def extract_credentials(authentication_response):
    """
    Pull the credentials out of the tuple returned by api.authenticate.

    :param authentication_response: the tuple returned by api.authenticate
    :returns: the element at index 3 of the tuple, which holds the credentials
    """
    credentials = authentication_response[3]
    return credentials
22+
23+
def authenticate_gspread(authentication_response):
    """
    Build an authorized gspread client from the tuple returned by api.authenticate.

    :param authentication_response: the tuple returned by api.authenticate
    :returns: the gspread client authorized with the credentials at index 3 of the tuple
    """
    # The credentials live at index 3 of the authentication tuple
    return gspread.authorize(authentication_response[3])
27+
28+
def authenticate_drive_api(authentication_response):
    """
    Get the Drive API object out of the tuple returned by api.authenticate.

    :param authentication_response: the tuple returned by api.authenticate
    :returns: the first element of the tuple, used as the Drive API object
    """
    drive_api = authentication_response[0]
    return drive_api
31+
32+
def check_sheet_exists(gc, sheet_name):
    """
    Report whether the gspread client can open a spreadsheet named 'sheet_name'.

    :param gc: the gspread client
    :param sheet_name: the name of the sheet to check for
    :returns: True if the sheet could be opened, False if gspread reports it as not found
    """
    try:
        gc.open(sheet_name)
    except gspread.exceptions.SpreadsheetNotFound:
        return False
    else:
        return True
45+
46+
def execute_drive_list(drive_api, search_params):
    """
    Run a files().list() query against the Drive API and gather every page of results.

    :param drive_api: the Drive API object
    :param search_params: the search query, see https://developers.google.com/drive/api/v3/search-files
    :returns: a list with the 'files' entries of every page of the response
    """
    collected = []
    next_token = None
    has_more = True
    while has_more:
        page = drive_api.files().list(
            q=search_params, spaces="drive", pageToken=next_token
        ).execute()
        collected.extend(page.get("files", []))
        # A missing nextPageToken means this was the last page
        next_token = page.get("nextPageToken")
        has_more = next_token is not None
    return collected
66+
67+
def search_for_folder_id(drive_api, folder_name, allow_trashed = False, allow_duplicates = False):
    """
    Search for a folder by name in the Drive API.
    Returns a list of folder ids that match the search criteria.

    :param drive_api: the Drive API object
    :param folder_name: the name of the folder to search for
    :param allow_trashed: whether to include trashed folders in the search, defaults to False
    :param allow_duplicates: whether to allow multiple folders with the same name, defaults to False
    :returns: a list of folder ids that match the search criteria
    :raises RuntimeError: if no folder matches, or if several match and allow_duplicates is False
    """
    # Backslashes and single quotes must be escaped inside a Drive query string,
    # otherwise a name such as "Bob's reports" produces a malformed query
    escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
    search_params = f"name = '{escaped_name}' and mimeType = 'application/vnd.google-apps.folder'"
    if not allow_trashed:
        search_params += " and trashed = false"

    files = execute_drive_list(drive_api, search_params)
    # Keep only results whose name is exactly the requested (unescaped) name
    matching_ids = [file["id"] for file in files if file["name"] == folder_name]

    if not matching_ids:
        raise RuntimeError("No such folder exists")
    if len(matching_ids) > 1 and not allow_duplicates:
        raise RuntimeError("Too many files returned")

    return matching_ids
92+
93+
94+
def create_sheet_in_folder(authentication_response, sheet_name, parent_folder_name=None, override_behavior=FILE_OVERRIDE_BEHAVIORS.EXIT_ANYWHERE):
    """
    Create a new sheet in the project with the given name and parent folder.
    Returns the new sheet.

    :param authentication_response: the service parameters tuple
    :param sheet_name: the name of the new sheet
    :param parent_folder_name: the name of the parent folder for the new sheet.
        If None, no parent is set on the new file and the Drive root is used when checking for existing files
    :param override_behavior: the behavior to take if the sheet already exists
    :returns: the gspread.Spreadsheet object of the new sheet
    :rtype: gspread.Spreadsheet
    :raises RuntimeError: if the sheet already exists and override_behavior forbids replacing it
    """
    # Build the gspread client and Drive API, and resolve the parent folder
    gc = authenticate_gspread(authentication_response)
    drive_api = authenticate_drive_api(authentication_response)
    parent_folder_id = None if parent_folder_name is None else search_for_folder_id(drive_api, parent_folder_name)[0]

    # Check if sheet already exists and handle based on input
    if check_sheet_exists(gc, sheet_name):
        if override_behavior == FILE_OVERRIDE_BEHAVIORS.EXIT_ANYWHERE:
            raise RuntimeError("Sheet already exists")

        # Backslashes and single quotes must be escaped inside a Drive query string,
        # otherwise a sheet name containing an apostrophe produces a malformed query
        escaped_name = sheet_name.replace("\\", "\\\\").replace("'", "\\'")
        parent_clause = (
            "'root' in parents"
            if parent_folder_id is None
            else f"'{parent_folder_id}' in parents"
        )
        matching_search = (
            f"name = '{escaped_name}' and mimeType = 'application/vnd.google-apps.spreadsheet'"
            f" and {parent_clause}"
        )
        matching_files = execute_drive_list(drive_api, matching_search)

        if matching_files:
            if override_behavior == FILE_OVERRIDE_BEHAVIORS.EXIT_IF_IN_SAME_PLACE:
                raise RuntimeError("File already exists in the same folder")
            if override_behavior == FILE_OVERRIDE_BEHAVIORS.OVERRIDE_IF_IN_SAME_PLACE:
                # Remove every same-named spreadsheet in the target folder before recreating it
                for file in matching_files:
                    drive_api.files().delete(fileId=file["id"]).execute()

    # Create file body
    body = {
        'name': sheet_name,
        'mimeType': 'application/vnd.google-apps.spreadsheet',
    }
    if parent_folder_id is not None:
        body["parents"] = [parent_folder_id]
    new_sheet = drive_api.files().create(body=body).execute()

    # Open the freshly created file by its id
    return gc.open_by_key(new_sheet["id"])
143+
144+
def fill_worksheet_with_df(
    sheet,
    df,
    worksheet_name,
    overlapBehavior,
    options=None,
):
    """
    Fill a worksheet with the contents of a DataFrame.
    If the worksheet already exists, the behavior is determined by overlapBehavior.
    The options dictionary can be used to customize the formatting of the worksheet.

    :param sheet: the gspread.Spreadsheet object
    :param df: the DataFrame to fill the worksheet with
    :param worksheet_name: the name of the worksheet to fill. Cannot be "Sheet1"
    :param overlapBehavior: the behavior to take if the worksheet already exists.
    :param options: the formatting options for the worksheet.
        Should be a dictionary with optional elements "bold_header", "center_header", "freeze_header", and "column_widths".
        "column_widths" is itself a dictionary with optional elements "justify" and "buffer_chars".
        Every formatting step is applied unless it is explicitly switched off. Defaults to None (apply everything)
    :raises RuntimeError: if the worksheet already exists and overlapBehavior is WORKSHEET_OVERRIDE_BEHAVIORS.EXIT
    """
    # Sheet1 is special since it's created by default, so it's not allowed
    assert worksheet_name != "Sheet1"

    # Avoid a shared mutable default; a missing key means "enabled"
    if options is None:
        options = {}
    column_width_options = options.get("column_widths", {})

    # Check if worksheet already exists and handle based on overlapBehavior
    try:
        worksheet = sheet.worksheet(worksheet_name)
    except gspread.exceptions.WorksheetNotFound:
        # One extra row so the grid holds the header as well as the data
        worksheet = sheet.add_worksheet(
            title=worksheet_name, rows=df.shape[0] + 1, cols=df.shape[1]
        )
    else:
        if overlapBehavior == WORKSHEET_OVERRIDE_BEHAVIORS.EXIT:
            raise RuntimeError("Worksheet already exists")

    # Add data to worksheet: header row followed by the data rows
    worksheet.update([df.columns.values.tolist()] + df.values.tolist())

    # Format worksheet
    # Justify Column Widths
    if column_width_options.get("justify", True):
        text_widths = df.columns.map(
            lambda column_name: df[column_name].astype(str).str.len().max()
        )
        header_widths = df.columns.str.len()
        buffer_chars = column_width_options.get("buffer_chars", DEFAULT_BUFFER_CHARS)
        # Width estimate: widest entry (cell or header) plus padding, scaled by the font size
        column_widths = [
            round((max(len_tuple) + buffer_chars) * FONT_SIZE_PTS * 1/PTS_PIXELS_RATIO)
            for len_tuple in zip(text_widths, header_widths)
        ]
        # rowcol_to_a1(1, n) yields e.g. "A1" or "AA1"; drop the trailing row
        # digit so columns past Z keep their full letters
        column_positions = [
            gspread.utils.rowcol_to_a1(1, i + 1)[:-1] for i, _ in enumerate(column_widths)
        ]
        gspread_formatting.set_column_widths(worksheet, list(zip(column_positions, column_widths)))
    # Freeze Header
    if options.get("freeze_header", True):
        gspread_formatting.set_frozen(worksheet, rows=1)
    format_options = gspread_formatting.CellFormat()
    # Bold Header
    if options.get("bold_header", True):
        format_options += gspread_formatting.CellFormat(textFormat=gspread_formatting.TextFormat(bold=True))
    # Center Header
    if options.get("center_header", True):
        format_options += gspread_formatting.CellFormat(horizontalAlignment="CENTER")
    gspread_formatting.format_cell_range(
        worksheet,
        f"A1:{gspread.utils.rowcol_to_a1(1, len(df.columns))}",
        format_options
    )

    # Delete Sheet1 if it has been created by default
    if "Sheet1" in [i.title for i in sheet.worksheets()]:
        sheet.del_worksheet(sheet.worksheet("Sheet1"))
223+
224+
def fill_spreadsheet_with_df_dict(sheet, df_dict, overlapBehavior, options=None):
    """
    Fill a sheet with the contents of a dictionary of DataFrames.
    The keys of the dictionary are the names of the worksheets, and the values contain the data to be placed in the sheet.
    If any worksheets would be overridden, the behavior is determined by overlapBehavior.

    :param sheet: the gspread.Spreadsheet object
    :param df_dict: the dictionary of DataFrames to fill the worksheets with
    :param overlapBehavior: the behavior to take if any of the worksheets already exist
    :param options: the formatting options for the worksheets.
        Should be a dictionary with optional elements "bold_header", "center_header", "freeze_header", and "column_widths".
        Defaults to None, which is treated as an empty dictionary
    :raises RuntimeError: if overlapBehavior is WORKSHEET_OVERRIDE_BEHAVIORS.EXIT and any worksheet already exists
    """
    # Avoid a shared mutable default argument
    if options is None:
        options = {}

    # Check every name up front so nothing is written if any worksheet already exists
    if overlapBehavior == WORKSHEET_OVERRIDE_BEHAVIORS.EXIT:
        for worksheet_name in df_dict:
            try:
                sheet.worksheet(worksheet_name)
            except gspread.exceptions.WorksheetNotFound:
                continue
            raise RuntimeError("Worksheet already exists")

    for worksheet_name, df in df_dict.items():
        fill_worksheet_with_df(sheet, df, worksheet_name, overlapBehavior, options=options)
245+
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
import pandas as pd
2+
from .charts import get_data_df
3+
from .fields import *
4+
from urllib.parse import urlparse
5+
6+
def get_flat_data_df(analytics_params, metrics, dimensions, remove_matches=None):
    """
    Get a df from the Analytics API with a flat structure (no multiindex).

    :param analytics_params: the parameters for the Analytics API, including authentication and property ids
    :param metrics: the metrics to get
    :param dimensions: the dimensions to get, as dictionaries with "id" and "alias" elements
    :param remove_matches: a list of regex patterns or None elements to remove from each dimension.
        Each regex or None element should correspond with an element of dimensions and remove_matches must be the same length as dimensions.
        Rows whose dimension value fully matches the pattern are dropped.
        If the value is None, no patterns are removed, defaults to None.

    :return: a DataFrame with the data from the Analytics API, with dimension columns renamed to their aliases
    """
    if remove_matches is not None:
        assert len(remove_matches) == len(dimensions), "remove_matches must be the same length as dimensions"

    df = get_data_df(
        metrics,
        [dimension["id"] for dimension in dimensions],
        **analytics_params,
    )
    if remove_matches is not None:
        # Use the patterns the caller passed in, one per index level (one level per dimension)
        for level, pattern in enumerate(remove_matches):
            if pattern is not None:
                df = df.loc[~df.index.get_level_values(level).str.fullmatch(pattern)]
    return df.reset_index().rename(columns=get_rename_dict(dimensions)).copy()
32+
33+
def get_rename_dict(dimensions):
    """
    Build the mapping used to rename dimension columns of a DataFrame.

    :param dimensions: the dimensions, as dictionaries with "id" and "alias" elements
    :return: a dictionary mapping each dimension's "id" to its "alias"
    """
    return {dimension["id"]: dimension["alias"] for dimension in dimensions}
38+
39+
def get_outbound_sheets_df(analytics_params):
    """
    Get a DF with outbound links from the Analytics API. Merges the builtin and custom events for outbound links.

    Rows from the builtin event are used as-is. Rows from the custom event are only used when their URL
    contains a "#" fragment; in that case they replace the builtin row with the same fragment-stripped URL.

    :param analytics_params: the parameters for the Analytics API, including authentication and property ids
    :return: a DataFrame with the outbound links from the Analytics API, sorted by click count (descending),
        with columns "Page Path", "Hostname", "Outbound Link", "Total Clicks" and "Total Users"
    """
    # NOTE: pd.set_option changes the pandas option globally, not only for this call
    pd.set_option('future.no_silent_downcasting', True)
    # Get the builtin "Click" event, dropping rows whose link URL is empty or whitespace,
    # and total the metrics per (page path, link URL)
    df_builtin_links = get_flat_data_df(
        analytics_params,
        [METRIC_EVENT_COUNT, METRIC_TOTAL_USERS],
        [DIMENSION_PAGE_PATH, DIMENSION_BUILTIN_URL, DIMENSION_EVENT_NAME],
        remove_matches=[None, r"\s*", None]
    ).groupby(
        [DIMENSION_PAGE_PATH["alias"], DIMENSION_BUILTIN_URL["alias"]]
    ).sum().reset_index()

    # Get the custom "outbound_link_click" event, dropping rows whose URL is "(not set)",
    # and total the metrics per (page path, custom URL)
    df_custom_links = get_flat_data_df(
        analytics_params,
        [METRIC_EVENT_COUNT, METRIC_TOTAL_USERS],
        [DIMENSION_EVENT_NAME, DIMENSION_CUSTOM_URL, DIMENSION_PAGE_PATH],
        remove_matches=[DIMENSION_EVENT_NAME["remove_matches"], r"\(not set\)", None],
    ).groupby(
        [DIMENSION_PAGE_PATH["alias"], DIMENSION_CUSTOM_URL["alias"]]
    ).sum().reset_index()
    # Concatenate the two dataframes, avoiding duplicates
    # Keep the link from the builtin event, unless the link contains a #fragment, in which case keep the link from the custom event
    df_builtin_links["builtin"] = True
    # "truncated_url" is the URL without its #fragment; it is the key used to pair builtin and custom rows
    df_builtin_links["truncated_url"] = df_builtin_links[DIMENSION_BUILTIN_URL["alias"]]
    df_custom_links["truncated_url"] = df_custom_links[DIMENSION_CUSTOM_URL["alias"]].str.replace(r"#.*", "", regex=True)
    # Only custom rows whose URL actually contains a fragment are carried forward
    df_outbound_links_fragments = df_custom_links.loc[df_custom_links[DIMENSION_CUSTOM_URL["alias"]].str.contains("#")].copy()
    df_outbound_links_fragments["is_fragment"] = True
    df_all_links = pd.concat(
        [df_builtin_links, df_outbound_links_fragments], ignore_index=True
    )
    # Drop builtin rows whose URL is the fragment-stripped form of a custom fragment link;
    # the custom row, which keeps the #fragment, stands in for them
    df_all_links = df_all_links.loc[
        ~(df_all_links["truncated_url"].isin(df_outbound_links_fragments["truncated_url"]) & df_all_links["builtin"])
    ].sort_values(METRIC_EVENT_COUNT, ascending=False)
    # Builtin rows have no "is_fragment" value after the concat; treat the missing value as False
    df_all_links["is_fragment"] = df_all_links["is_fragment"].fillna(False).astype(bool)
    # Use the builtin link, unless the link is a fragment, in which case use the custom link
    df_all_links["complete_url"] = df_all_links["builtin_url"].where(
        ~df_all_links["is_fragment"],
        df_all_links["outbound_url"]
    )
    df_all_links["hostname"] = df_all_links["complete_url"].map(lambda x: urlparse(x).hostname)
    # Drop the helper columns, give the rest display names, and keep only the display columns in order
    df_all_links = df_all_links.drop(
        columns=["builtin_url", "outbound_url", "builtin", "is_fragment"]
    ).rename(
        columns={
            DIMENSION_PAGE_PATH["alias"]: "Page Path",
            "complete_url": "Outbound Link",
            METRIC_EVENT_COUNT: "Total Clicks",
            METRIC_TOTAL_USERS: "Total Users",
            "hostname": "Hostname",
        }
    )[["Page Path", "Hostname", "Outbound Link", "Total Clicks", "Total Users"]]
    return df_all_links.copy().reset_index(drop=True)

analytics/analytics_package/setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
setup(
44
name="analytics",
5-
version="3.0.1",
5+
version="3.1.0",
66
packages=["analytics"],
7-
install_requires=["matplotlib", "pandas", "numpy", "google-auth-oauthlib", "google-api-python-client"],
7+
install_requires=["matplotlib", "pandas", "numpy", "google-auth-oauthlib", "google-api-python-client", "gspread", "gspread-formatting"],
88
)

0 commit comments

Comments
 (0)