Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(reader): add range named parameter #57

Merged
merged 11 commits into from
Feb 6, 2025
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ _site/
credentials.json
compile_commands.json
.cache/clangd/
.helix/
9 changes: 8 additions & 1 deletion docs/pages/index.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
---
title: DuckDB GSheets
description: A DuckDB extension for reading and writing Google Sheets with SQL.
hide_title: true
---

Expand Down Expand Up @@ -62,6 +63,13 @@ SELECT * FROM read_gsheet('11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8', all_va
-- Read a sheet other than the first sheet using the sheet name
SELECT * FROM read_gsheet('11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8', sheet='Sheet2');

-- Read a spreadsheet using a specific range
SELECT * FROM read_gsheet('11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8', sheet='Sheet1', range='B1:C7');
-- or using A1 notation
SELECT * FROM read_gsheet('11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8', sheet='Sheet1!B1:C7');
-- or from range in URL
SELECT * FROM read_gsheet('https://docs.google.com/spreadsheets/d/11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8/edit?gid=0#gid=0&range=B1:C7');

-- Read a sheet other than the first sheet using the sheet id in the URL
SELECT * FROM read_gsheet('https://docs.google.com/spreadsheets/d/11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8/edit?gid=644613997#gid=644613997');
```
Expand Down Expand Up @@ -110,7 +118,6 @@ This token will periodically expire - you can re-run the above command again to

- DuckDB WASM is not (yet) supported.
- Google Sheets has a limit of 10,000,000 cells per spreadsheet.
- Reading sheets where data does not start in A1 is not yet supported.
- Writing data to a sheet starting from a cell other than A1 is not yet supported.
- Sheets must already exist to COPY TO them.

Expand Down
6 changes: 4 additions & 2 deletions src/gsheets_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ namespace duckdb

// Make the API call to write data to the Google Sheet
// Today, this is only append.
std::string response = call_sheets_api(spreadsheet_id, token, encoded_sheet_name, HttpMethod::POST, request_body);
// TODO: add support for ranged writes https://developers.google.com/sheets/api/samples/writing
std::string response = call_sheets_api(spreadsheet_id, token, encoded_sheet_name, "", HttpMethod::POST, request_body);

// Check for errors in the response
json response_json = parseJson(response);
Expand Down Expand Up @@ -142,7 +143,8 @@ namespace duckdb

// Make the API call to write data to the Google Sheet
// Today, this is only append.
std::string response = call_sheets_api(gstate.spreadsheet_id, gstate.token, encoded_sheet_name, HttpMethod::POST, request_body);
// TODO: add support for ranged writes https://developers.google.com/sheets/api/samples/writing
std::string response = call_sheets_api(gstate.spreadsheet_id, gstate.token, encoded_sheet_name, "", HttpMethod::POST, request_body);

// Check for errors in the response
json response_json = parseJson(response);
Expand Down
1 change: 1 addition & 0 deletions src/gsheets_extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ static void LoadInternal(DatabaseInstance &instance) {
TableFunction read_gsheet_function("read_gsheet", {LogicalType::VARCHAR}, ReadSheetFunction, ReadSheetBind);
read_gsheet_function.named_parameters["header"] = LogicalType::BOOLEAN;
read_gsheet_function.named_parameters["sheet"] = LogicalType::VARCHAR;
read_gsheet_function.named_parameters["range"] = LogicalType::VARCHAR;
read_gsheet_function.named_parameters["all_varchar"] = LogicalType::BOOLEAN;
ExtensionUtil::RegisterFunction(instance, read_gsheet_function);

Expand Down
42 changes: 38 additions & 4 deletions src/gsheets_read.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,16 @@
#include "duckdb/main/secret/secret_manager.hpp"
#include "gsheets_requests.hpp"
#include <json.hpp>
#include <string>
#include <regex>

namespace duckdb {

using json = nlohmann::json;

ReadSheetBindData::ReadSheetBindData(string spreadsheet_id, string token, bool header, string sheet_name)
: spreadsheet_id(spreadsheet_id), token(token), finished(false), row_index(0), header(header), sheet_name(sheet_name) {
response = call_sheets_api(spreadsheet_id, token, sheet_name, HttpMethod::GET);
ReadSheetBindData::ReadSheetBindData(string spreadsheet_id, string token, bool header, string sheet_name, string sheet_range)
: spreadsheet_id(spreadsheet_id), token(token), finished(false), row_index(0), header(header), sheet_name(sheet_name), sheet_range(sheet_range) {
response = call_sheets_api(spreadsheet_id, token, sheet_name, sheet_range, HttpMethod::GET);
}

bool IsValidNumber(const string& value) {
Expand All @@ -31,6 +33,12 @@ bool IsValidNumber(const string& value) {
}
}

/**
 * Syntactic check for the cell-range part of an A1-notation reference
 * (the text after the `!`; a sheet-name prefix is NOT accepted here).
 *
 * Accepted forms, each with optional `$` absolute markers:
 *   - single cell:            A1, $A$1
 *   - bounded rectangle:      A1:B2, $A1:$B2
 *   - open-ended from a cell: A5:A
 *   - whole columns:          A:A, A:C
 *   - whole rows:             1:2
 *
 * This is only a shape check: it does not verify that the referenced cells
 * exist or that the end of the range comes after the start.
 *
 * @param range The range text to validate
 * @return true if the text has the shape of an A1 range, false otherwise
 */
bool IsValidA1Range(const std::string& range) {
    // Compose the pattern from named pieces so each accepted form is readable.
    // Built once; function-local statics are initialised thread-safely.
    static const std::regex pattern = [] {
        const std::string col = "\\$?[A-Za-z]+";
        const std::string row = "\\$?[0-9]+";
        const std::string cell = col + row;
        return std::regex(
            "^(?:" +
            cell + "(?::" + col + "(?:" + row + ")?)?" + // A1, A1:B2, A5:A
            "|" + col + ":" + col +                      // A:A
            "|" + row + ":" + row +                      // 1:2
            ")$");
    }();
    return std::regex_match(range, pattern);
}

void ReadSheetFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
auto &bind_data = const_cast<ReadSheetBindData&>(data_p.bind_data->Cast<ReadSheetBindData>());

Expand Down Expand Up @@ -105,6 +113,9 @@ unique_ptr<FunctionData> ReadSheetBind(ClientContext &context, TableFunctionBind
// Extract the spreadsheet ID from the input (URL or ID)
std::string spreadsheet_id = extract_spreadsheet_id(sheet_input);

// Try to extract the range from the input (URL or ID)
std::string sheet_range = extract_sheet_range(sheet_input);

// Use the SecretManager to get the token
auto &secret_manager = SecretManager::Get(context);
auto transaction = CatalogTransaction::GetSystemCatalogTransaction(context);
Expand Down Expand Up @@ -146,11 +157,34 @@ unique_ptr<FunctionData> ReadSheetBind(ClientContext &context, TableFunctionBind
throw InvalidInputException("Invalid value for 'use_varchar' parameter. Expected a boolean value.");
}
} else if (kv.first == "sheet") {
// TODO: maybe factor this out to clean up this space
use_explicit_sheet_name = true;
sheet_name = kv.second.GetValue<string>();

// Check if sheet name is quoted and therefore might contain a `!` char that doesn't indicate A1 notation
if (!sheet_name.empty() && sheet_name[0] == '\'') {
size_t closing_quote_pos = sheet_name.find('\'', 1);
if (closing_quote_pos != std::string::npos) {
// Check if there is a `!` char after the closing quote which would indicate A1 notation
if (closing_quote_pos + 1 < sheet_name.size() && sheet_name[closing_quote_pos + 1] == '!') {
sheet_range = sheet_name.substr(closing_quote_pos + 2);
}
// keep only unquoted part of name
sheet_name = sheet_name.substr(1, closing_quote_pos - 1);
}
} else {
// No quotes means any `!` char indicates A1 notation
size_t pos = sheet_name.find("!");
if (pos != std::string::npos) {
sheet_range = sheet_name.substr(pos + 1);
sheet_name = sheet_name.substr(0, pos);
}
}

// Validate that sheet with name exists for better error messaging
sheet_id = get_sheet_id_from_name(spreadsheet_id, sheet_name, token);
} else if (kv.first == "range") {
sheet_range = kv.second.GetValue<string>();
}
}

Expand All @@ -167,7 +201,7 @@ unique_ptr<FunctionData> ReadSheetBind(ClientContext &context, TableFunctionBind

std::string encoded_sheet_name = url_encode(sheet_name);

auto bind_data = make_uniq<ReadSheetBindData>(spreadsheet_id, token, header, encoded_sheet_name);
auto bind_data = make_uniq<ReadSheetBindData>(spreadsheet_id, token, header, encoded_sheet_name, sheet_range);

json cleanJson = parseJson(bind_data->response);
SheetData sheet_data = getSheetData(cleanJson);
Expand Down
6 changes: 5 additions & 1 deletion src/gsheets_requests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,15 @@ namespace duckdb
return response;
}

std::string call_sheets_api(const std::string &spreadsheet_id, const std::string &token, const std::string &sheet_name, HttpMethod method, const std::string &body)
std::string call_sheets_api(const std::string &spreadsheet_id, const std::string &token, const std::string &sheet_name, const std::string& sheet_range, HttpMethod method, const std::string &body)
{
std::string host = "sheets.googleapis.com";
std::string path = "/v4/spreadsheets/" + spreadsheet_id + "/values/" + sheet_name;

if (!sheet_range.empty()) {
path += "!" + sheet_range;
}

if (method == HttpMethod::POST) {
path += ":append";
path += "?valueInputOption=USER_ENTERED";
Expand Down
11 changes: 11 additions & 0 deletions src/gsheets_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,17 @@ std::string extract_sheet_id(const std::string& input) {
return "";
}

/**
 * Extracts the value of the `range` parameter from a Google Sheets URL.
 *
 * The parameter may appear in the query string or in the fragment
 * (e.g. `...#gid=0&range=B1:C7`). It is recognised only when it directly
 * follows a `?`, `&` or `#`, so parameters whose names merely end in
 * "range" are not mistaken for it, and the value stops at the next `&` or
 * `#` so a trailing fragment is not swallowed into the range.
 *
 * @param input A Google Sheets URL (any other input yields an empty string)
 * @return The raw range text as it appears in the URL, or "" if absent
 */
std::string extract_sheet_range(const std::string& input) {
    // Only Google Sheets URLs can carry a range; bare spreadsheet IDs cannot.
    if (input.find("docs.google.com/spreadsheets/d/") == std::string::npos) {
        return "";
    }

    // Compiled once rather than on every call.
    static const std::regex sheet_range_regex("[?&#]range=([^&#]+)");
    std::smatch match;
    if (std::regex_search(input, match, sheet_range_regex)) {
        return match.str(1);
    }
    return "";
}

std::string get_sheet_name_from_id(const std::string& spreadsheet_id, const std::string& sheet_id, const std::string& token) {
std::string metadata_response = get_spreadsheet_metadata(spreadsheet_id, token);
json metadata = parseJson(metadata_response);
Expand Down
5 changes: 3 additions & 2 deletions src/include/gsheets_read.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,16 @@ struct ReadSheetBindData : public TableFunctionData {
string response;
bool header;
string sheet_name;
string sheet_range;
vector<LogicalType> return_types;
vector<string> names;

ReadSheetBindData(string spreadsheet_id, string token, bool header, string sheet_name);
ReadSheetBindData(string spreadsheet_id, string token, bool header, string sheet_name, string sheet_range);
};

void ReadSheetFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output);

unique_ptr<FunctionData> ReadSheetBind(ClientContext &context, TableFunctionBindInput &input,
vector<LogicalType> &return_types, vector<string> &names);

} // namespace duckdb
} // namespace duckdb
4 changes: 2 additions & 2 deletions src/include/gsheets_requests.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ enum class HttpMethod {
std::string perform_https_request(const std::string& host, const std::string& path, const std::string& token,
HttpMethod method = HttpMethod::GET, const std::string& body = "", const std::string& content_type = "application/json");

std::string call_sheets_api(const std::string& spreadsheet_id, const std::string& token, const std::string& sheet_name, HttpMethod method = HttpMethod::GET, const std::string& body = "");
std::string call_sheets_api(const std::string& spreadsheet_id, const std::string& token, const std::string& sheet_name, const std::string& sheet_range, HttpMethod method = HttpMethod::GET, const std::string& body = "");

std::string delete_sheet_data(const std::string& spreadsheet_id, const std::string& token, const std::string& sheet_name);

std::string get_spreadsheet_metadata(const std::string& spreadsheet_id, const std::string& token);
}
}
7 changes: 7 additions & 0 deletions src/include/gsheets_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ std::string extract_spreadsheet_id(const std::string& input);
*/
std::string extract_sheet_id(const std::string& input);

/**
* Extracts the sheet range from a Google Sheets URL
* @param input A Google Sheets URL
* @return The extracted sheet range
*/
std::string extract_sheet_range(const std::string& input);

/**
* Gets the sheet name from a spreadsheet ID and sheet ID
* @param spreadsheet_id The spreadsheet ID
Expand Down
70 changes: 69 additions & 1 deletion test/sql/read_gsheet.test
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,74 @@ BOS 35.5 23.0 Boston MA Northeast
BNY 56.0 24.5 Brooklyn NY Northeast
BUF 32.5 23.0 Buffalo NY Northeast

# Test the range parameter
query II
FROM read_gsheet('11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8', sheet='Sheet1', range='A2:B7', header=false);
----
Alice 30.0
Bob 25.0
Charlie 45.0
Drake NULL
NULL NULL
Archie 99.0

query II
FROM read_gsheet('11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8', sheet='Sheet1', range='A2:B7');
----
Bob 25.0
Charlie 45.0
Drake NULL
NULL NULL
Archie 99.0

# Test the range parameter from a quoted sheet
query II
FROM read_gsheet('11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8', sheet='''Sheet1!''', range='A2:B7');
----
Bob 25.0
Charlie 45.0
Drake NULL
NULL NULL
Archie 99.0

# Test the range parameter from a quoted sheet with A1 notation
query II
FROM read_gsheet('11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8', sheet='''Sheet1!''!A2:B7');
----
Bob 25.0
Charlie 45.0
Drake NULL
NULL NULL
Archie 99.0

# Test the range parameter using A1 notation
query II
FROM read_gsheet('11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8', sheet='Sheet1!A2:B7');
----
Bob 25.0
Charlie 45.0
Drake NULL
NULL NULL
Archie 99.0

# Test single value from range
# NOTE: *must* use `header=false` to avoid uncaught bind error
Copy link
Member

@archiewood archiewood Feb 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I just saw this comment. I wonder why this is?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There must be a bug caused by empty results. I'll need to do some debugging to see if maybe the shape of the fetch sheets response is different for single cell ranges or if the error comes from something simpler. Will investigate and put up an issue once I've tracked down the source.

query I
FROM read_gsheet('11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8', sheet='Sheet1', range='A2', header=false);
----
Alice

# Test extract range from URL
query II
FROM read_gsheet('https://docs.google.com/spreadsheets/d/11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8/edit?gid=0#gid=0&range=B1:C7');
----
30.0 Toronto
25.0 New York
45.0 Chicago
NULL NULL
NULL NULL
99.0 NULL

# Test types - should read numbers as doubles
query I
select age from read_gsheet('11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8') limit 10;
Expand Down Expand Up @@ -102,4 +170,4 @@ more wooting more blah NULL should get this!

# Drop the secret
statement ok
drop secret test_secret;
drop secret test_secret;
Loading