Commit bb68fb1

jpaten committed
feat: added spreadsheet functions to ga package (#4305)

1 parent 19e0f60 commit bb68fb1
File tree: 3 files changed, +338 -0 lines changed
@@ -0,0 +1,239 @@
import gspread
import gspread_formatting
from enum import Enum
from googleapiclient.discovery import build


class FILE_OVERRIDE_BEHAVIORS(Enum):
    OVERRIDE_IF_IN_SAME_PLACE = 1
    EXIT_IF_IN_SAME_PLACE = 2
    EXIT_ANYWHERE = 3

class WORKSHEET_OVERRIDE_BEHAVIORS(Enum):
    OVERRIDE = 1
    EXIT = 2

FONT_SIZE_PTS = 10
PTS_PIXELS_RATIO = 4/3
DEFAULT_BUFFER_CHARS = 4

def extract_credentials(authentication_response):
    """Extracts the credentials from the tuple from api.authenticate"""
    return authentication_response[3]

def authenticate_gspread(authentication_response):
    """Authenticates the gspread client using the credentials in the tuple from api.authenticate"""
    gc = gspread.authorize(extract_credentials(authentication_response))
    return gc

def authenticate_drive_api(authentication_response):
    """Authenticates the Drive API using the credentials in the tuple from api.authenticate"""
    return build('drive', 'v3', credentials=extract_credentials(authentication_response))

def check_sheet_exists(gc, sheet_name):
    """
    Determine if a sheet named 'sheet_name' exists anywhere in the project.

    :param gc: the gspread client
    :param sheet_name: the name of the sheet to check for
    :returns: True if the sheet exists, otherwise False
    """
    try:
        gc.open(sheet_name)
        return True
    except gspread.exceptions.SpreadsheetNotFound:
        return False

def execute_drive_list(drive_api, search_params):
    """
    Execute a files().list() request on the Drive API with the given search parameters.
    Returns the 'files' components of the response.

    :param drive_api: the Drive API object
    :param search_params: the search parameters, see https://developers.google.com/drive/api/v3/search-files
    :returns: the 'files' components of the response
    """
    files_found = []
    page_token = None
    while True:
        request = drive_api.files().list(q=search_params, spaces="drive", pageToken=page_token)
        response = request.execute()
        page_token = response.get("nextPageToken", None)
        files_found += response.get("files", [])
        if page_token is None:
            break
    return files_found

def search_for_folder_id(drive_api, folder_name, allow_trashed=False, allow_duplicates=False):
    """
    Search for a folder by name in the Drive API.
    Returns a list of folder ids that match the search criteria.

    :param drive_api: the Drive API object
    :param folder_name: the name of the folder to search for
    :param allow_trashed: whether to include trashed folders in the search, defaults to False
    :param allow_duplicates: whether to allow multiple folders with the same name, defaults to False
    :returns: a list of folder ids that match the search criteria
    """
    search_params = f"name = '{folder_name}' and mimeType = 'application/vnd.google-apps.folder'"
    if not allow_trashed:
        search_params += " and trashed = false"

    files = execute_drive_list(drive_api, search_params)
    files_exact_match = tuple(filter(lambda file: file["name"] == folder_name, files))

    if len(files_exact_match) > 1:
        if not allow_duplicates:
            raise RuntimeError("Too many files returned")

    return [file["id"] for file in files_exact_match]


def create_sheet_in_folder(authentication_response, sheet_name, parent_folder_name=None, override_behavior=FILE_OVERRIDE_BEHAVIORS.EXIT_ANYWHERE):
    """
    Create a new sheet in the project with the given name and parent folder.
    Returns the new sheet.

    :param authentication_response: the service parameters tuple
    :param sheet_name: the name of the new sheet
    :param parent_folder_name: the name of the parent folder for the new sheet
    :param override_behavior: the behavior to take if the sheet already exists
    :returns: the gspread.Spreadsheet object of the new sheet
    :rtype: gspread.Spreadsheet
    """
    # Build Drive API
    drive_credentials = extract_credentials(authentication_response)
    gc = gspread.authorize(drive_credentials)
    drive_api = build('drive', 'v3', credentials=drive_credentials)
    parent_folder_id = None if parent_folder_name is None else search_for_folder_id(drive_api, parent_folder_name)[0]

    # Check if sheet already exists and handle based on input
    if check_sheet_exists(gc, sheet_name):
        if override_behavior == FILE_OVERRIDE_BEHAVIORS.EXIT_ANYWHERE:
            raise RuntimeError("Sheet already exists")
        matching_search = f"name = '{sheet_name}' and mimeType = 'application/vnd.google-apps.spreadsheet'"
        if parent_folder_id is None:
            matching_search += " and 'root' in parents"
        else:
            matching_search += f" and '{parent_folder_id}' in parents"
        matching_files = execute_drive_list(drive_api, matching_search)

        if len(matching_files) > 0:
            if override_behavior == FILE_OVERRIDE_BEHAVIORS.EXIT_IF_IN_SAME_PLACE:
                raise RuntimeError("File already exists in the same folder")
            elif override_behavior == FILE_OVERRIDE_BEHAVIORS.OVERRIDE_IF_IN_SAME_PLACE:
                for file in matching_files:
                    drive_api.files().delete(fileId=file["id"]).execute()
    # Create file body
    body = {
        'name': sheet_name,
        'mimeType': 'application/vnd.google-apps.spreadsheet',
    }
    if parent_folder_id is not None:
        body["parents"] = [parent_folder_id]
    request = drive_api.files().create(body=body)
    new_sheet = request.execute()

    # Get id of fresh sheet
    spread_id = new_sheet["id"]

    # Open new file
    return gc.open_by_key(spread_id)

def fill_worksheet_with_df(
    sheet,
    df,
    worksheet_name,
    overlapBehavior,
    options={
        "bold_header": True,
        "center_header": True,
        "freeze_header": True,
        "column_widths": {"justify": True, "buffer_chars": DEFAULT_BUFFER_CHARS}
    }
):
    """
    Fill a worksheet with the contents of a DataFrame.
    If the worksheet already exists, the behavior is determined by overlapBehavior.
    The options dictionary can be used to customize the formatting of the worksheet.

    :param sheet: the gspread.Spreadsheet object
    :param df: the DataFrame to fill the worksheet with
    :param worksheet_name: the name of the worksheet to fill. Cannot be "Sheet1"
    :param overlapBehavior: the behavior to take if the worksheet already exists.
    :param options: the formatting options for the worksheet.
        Should be a dictionary with the optional elements "bold_header", "center_header", "freeze_header", and "column_widths"
    """
    # Sheet1 is special since it's created by default, so it's not allowed
    assert worksheet_name != "Sheet1"

    # Check if worksheet already exists and handle based on overlapBehavior
    try:
        worksheet = sheet.worksheet(worksheet_name)
        if overlapBehavior == WORKSHEET_OVERRIDE_BEHAVIORS.EXIT:
            raise RuntimeError("Worksheet already exists")
    except gspread.exceptions.WorksheetNotFound:
        worksheet = sheet.add_worksheet(
            title=worksheet_name, rows=df.shape[0], cols=df.shape[1]
        )

    # Add data to worksheet
    worksheet.update([df.columns.values.tolist()] + df.values.tolist())

    # Format worksheet
    # Justify Column Widths
    if "column_widths" not in options or options["column_widths"]["justify"]:
        text_widths = df.astype(str).columns.map(
            lambda column_name: df[column_name].astype(str).str.len().max()
        )
        header_widths = df.columns.str.len()
        column_widths = [
            round((max(len_tuple) + options["column_widths"]["buffer_chars"]) * FONT_SIZE_PTS * 1/PTS_PIXELS_RATIO)
            for len_tuple in zip(text_widths, header_widths)
        ]
        column_positions = [
            gspread.utils.rowcol_to_a1(1, i + 1)[0] for i, _ in enumerate(column_widths)
        ]
        gspread_formatting.set_column_widths(worksheet, zip(column_positions, column_widths))
    # Freeze Header
    if "freeze_header" not in options or options["freeze_header"]:
        gspread_formatting.set_frozen(worksheet, rows=1)
    format_options = gspread_formatting.CellFormat()
    # Bold Header
    if "bold_header" not in options or options["bold_header"]:
        format_options += gspread_formatting.CellFormat(textFormat=gspread_formatting.TextFormat(bold=True))
    # Center Header
    if "center_header" not in options or options["center_header"]:
        format_options += gspread_formatting.CellFormat(horizontalAlignment="CENTER")
    gspread_formatting.format_cell_range(
        worksheet,
        f"A1:{gspread.utils.rowcol_to_a1(1, len(df.columns))}",
        format_options
    )

    # Delete Sheet1 if it has been created by default
    if "Sheet1" in [i.title for i in sheet.worksheets()]:
        sheet.del_worksheet(sheet.worksheet("Sheet1"))

def fill_spreadsheet_with_df_dict(sheet, df_dict, overlapBehavior):
    """
    Fill a sheet with the contents of a dictionary of DataFrames.
    The keys of the dictionary are the names of the worksheets, and the values contain the data to be placed in the sheet.
    If any worksheets would be overridden, the behavior is determined by overlapBehavior.

    :param sheet: the gspread.Spreadsheet object
    :param df_dict: the dictionary of DataFrames to fill the worksheets with
    :param overlapBehavior: the behavior to take if any of the worksheets already exist
    """
    if overlapBehavior == WORKSHEET_OVERRIDE_BEHAVIORS.EXIT:
        for worksheet_name in df_dict.keys():
            try:
                sheet.worksheet(worksheet_name)
                raise RuntimeError("Worksheet already exists")
            except gspread.exceptions.WorksheetNotFound:
                pass
    for worksheet_name, df in df_dict.items():
        fill_worksheet_with_df(sheet, df, worksheet_name, overlapBehavior)
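Usage note (not part of the diff): a minimal sketch of how these helpers might be chained, assuming an authentication_response tuple produced elsewhere in the package by api.authenticate, plus hypothetical sheet, folder, and worksheet names:

    import pandas as pd

    # Hypothetical DataFrames, keyed by the worksheet name they should land in.
    df_dict = {
        "Summary": pd.DataFrame({"Metric": ["Total Clicks"], "Value": [42]}),
        "Details": pd.DataFrame({"Page Path": ["/home"], "Total Clicks": [42]}),
    }

    # Create (or replace) the spreadsheet in a Drive folder, then write one worksheet per DataFrame.
    sheet = create_sheet_in_folder(
        authentication_response,                  # assumed to come from api.authenticate
        "GA Report",                              # hypothetical sheet name
        parent_folder_name="Analytics Reports",   # hypothetical Drive folder
        override_behavior=FILE_OVERRIDE_BEHAVIORS.OVERRIDE_IF_IN_SAME_PLACE,
    )
    fill_spreadsheet_with_df_dict(sheet, df_dict, WORKSHEET_OVERRIDE_BEHAVIORS.OVERRIDE)

Worksheet names other than "Sheet1" are required here, since fill_worksheet_with_df rejects that name and deletes the default Sheet1 after writing.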
@@ -0,0 +1,97 @@
import pandas as pd
from .charts import get_data_df
from urllib.parse import urlparse

def get_flat_data_df(analytics_params, metrics, dimensions, remove_matches=None):
    """
    Get a df from the Analytics API with a flat structure (no multiindex).

    :param analytics_params: the parameters for the Analytics API, including authentication and property ids
    :param metrics: the metrics to get
    :param dimensions: the dimensions to get
    :param remove_matches: a list of regex patterns or None elements to remove from each dimension.
        Each regex or None element should correspond to an element of dimensions, so remove_matches must be the same length as dimensions.
        If the value is None, no patterns are removed. Defaults to None.

    :return: a DataFrame with the data from the Analytics API
    """
    if remove_matches is not None:
        assert len(remove_matches) == len(dimensions)

    df = get_data_df(
        metrics,
        dimensions,
        **analytics_params,
    )
    if remove_matches is not None:
        for i, match in enumerate(remove_matches):
            if match is not None:
                df = df.loc[~df.index.get_level_values(i).str.fullmatch(match)]
    return df.reset_index().copy()


def get_outbound_sheets_df(analytics_params):
    """
    Get a DataFrame with outbound links from the Analytics API. Merges the builtin and custom events for outbound links.

    :param analytics_params: the parameters for the Analytics API, including authentication and property ids
    :return: a DataFrame with the outbound links from the Analytics API
    """
    pd.set_option('future.no_silent_downcasting', True)
    # Get the builtin "Click" event
    df_builtin_links = get_flat_data_df(
        analytics_params,
        ["eventCount", "totalUsers"],
        ["pagePath", "linkUrl", "eventName"],
        remove_matches=[None, r"\s*", None]
    ).groupby(
        ["pagePath", "linkUrl"]
    ).sum().reset_index().rename(
        columns={"linkUrl": "builtin_url"}
    )

    # Get the custom "outbound_link_click" event
    df_custom_links = get_flat_data_df(
        analytics_params,
        ["eventCount", "totalUsers"],
        ["pagePath", "customEvent:click_url", "eventName"],
        remove_matches=[None, r"\(not set\)", None],
    ).groupby(
        ["pagePath", "customEvent:click_url"]
    ).sum().reset_index().rename(
        columns={"customEvent:click_url": "outbound_url"}
    )
    # Concatenate the two dataframes, avoiding duplicates
    # Keep the link from the builtin event, unless the link contains a #fragment, in which case keep the link from the custom event
    df_builtin_links["builtin"] = True
    df_builtin_links["truncated_url"] = df_builtin_links["builtin_url"]
    df_custom_links["truncated_url"] = df_custom_links["outbound_url"].str.replace(r"#.*", "", regex=True)
    df_outbound_links_fragments = df_custom_links.loc[df_custom_links["outbound_url"].str.contains("#")]
    df_outbound_links_fragments["is_fragment"] = True
    df_all_links = pd.concat(
        [df_builtin_links, df_outbound_links_fragments], ignore_index=True
    )
    df_all_links = df_all_links.loc[
        ~(df_all_links["truncated_url"].isin(df_outbound_links_fragments["truncated_url"]) & df_all_links["builtin"])
    ].sort_values("eventCount", ascending=False)
    # Determine whether a link is a fragment or an outbound link
    df_all_links["outbound"] = df_all_links["truncated_url"].isin(df_custom_links["truncated_url"])
    df_all_links["is_fragment"] = df_all_links["is_fragment"].fillna(False).astype(bool)
    df_all_links["complete_url"] = df_all_links["builtin_url"].where(
        ~df_all_links["is_fragment"],
        df_all_links["outbound_url"]
    )
    df_all_links["hostname"] = df_all_links["complete_url"].map(lambda x: urlparse(x).hostname)
    df_all_links = df_all_links.drop(
        columns=["builtin_url", "outbound_url", "builtin", "is_fragment"]
    ).rename(
        columns={
            "pagePath": "Page Path",
            "complete_url": "Outbound Link",
            "eventCount": "Total Clicks",
            "totalUsers": "Total Users",
            "outbound": "Is Outbound",
            "hostname": "Hostname",
        }
    )[["Page Path", "Hostname", "Outbound Link", "Total Clicks", "Total Users", "Is Outbound"]]
    return df_all_links.copy().reset_index(drop=True)
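Usage note (not part of the diff): a sketch of how this outbound-links table might be written to Google Sheets with the helpers added above, assuming both modules are importable within the ga package, that analytics_params carries the credentials and property ids expected by get_data_df, and that sheet is a gspread.Spreadsheet (for example, one returned by create_sheet_in_folder):

    # Pull the merged outbound-link clicks and write them to a single worksheet.
    df_outbound = get_outbound_sheets_df(analytics_params)   # analytics_params assumed defined
    fill_worksheet_with_df(
        sheet,
        df_outbound,
        "Outbound Links",                      # hypothetical worksheet name; must not be "Sheet1"
        WORKSHEET_OVERRIDE_BEHAVIORS.OVERRIDE,
    )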

analytics/requirements.txt (+2 lines)
@@ -26,6 +26,8 @@ google-auth==2.34.0
 google-auth-httplib2==0.2.0
 google-auth-oauthlib==1.2.1
 googleapis-common-protos==1.65.0
+gspread==6.1.4
+gspread-formatting==1.2.0
 httplib2==0.22.0
 idna==3.10
 imagesize==1.4.1
