Skip to content

Commit e535537

Browse files
committed
Add more Sqllogictest-style tests
1 parent be82e81 commit e535537

14 files changed

+170
-15
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
[![Docs](https://img.shields.io/badge/docs-read-blue?style=flat&labelColor=282c34&logo=read-the-docs)](https://github.com/CogitatorTech/gaggle/tree/main/docs)
1313
[![License](https://img.shields.io/badge/license-MIT%2FApache--2.0-007ec6?style=flat&labelColor=282c34&logo=open-source-initiative)](https://github.com/CogitatorTech/gaggle)
1414

15-
Kaggle Datasets for DuckDB
15+
Access and Query Kaggle Datasets from DuckDB
1616

1717
</div>
1818

ROADMAP.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ It outlines features to be implemented and their current status.
3939
* [x] CSV and TSV file reading.
4040
* [x] Parquet file reading.
4141
* [x] JSON file reading.
42-
* [ ] Excel (XLSX) file reading. (Available when DuckDB is built with the Excel reader; replacement scan routes `.xlsx` to `read_excel`.)
42+
* [x] Excel (XLSX) file reading.
4343
* **Querying Datasets**
4444
* [x] Replacement scan for `kaggle:` URLs.
4545
* [ ] Virtual table support for lazy loading.
@@ -66,8 +66,8 @@ It outlines features to be implemented and their current status.
6666
* [x] Detailed error codes for programmatic error handling.
6767
* **Resilience**
6868
* [x] Automatic retry on network failures.
69-
* [ ] Graceful degradation when Kaggle API is unavailable.
7069
* [x] Local-only mode for cached datasets (via `GAGGLE_OFFLINE`).
70+
* [ ] Graceful degradation when Kaggle API is unavailable.
7171

7272
### 6. Documentation and Distribution
7373

docs/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ The table below includes the information about all SQL functions exposed by Gagg
1616
| 10 | `gaggle_update_dataset(dataset_path VARCHAR)` | `VARCHAR` | Forces update to latest version (ignores cache). Returns local path to freshly downloaded dataset. |
1717
| 11 | `gaggle_version_info(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Returns version info: `cached_version`, `latest_version`, `is_current`, `is_cached`. |
1818
| 12 | `gaggle_json_each(json VARCHAR)` | `VARCHAR` | Expands a JSON object/array into newline-delimited JSON rows with fields: `key`, `value`, `type`, `path`. |
19-
| 13 | `gaggle_file_paths(dataset_path VARCHAR, filename VARCHAR)` | `VARCHAR` | Resolves a specific file's local path inside a downloaded dataset. |
19+
| 13 | `gaggle_file_path(dataset_path VARCHAR, filename VARCHAR)` | `VARCHAR` | Resolves a specific file's local path inside a downloaded dataset. |
2020

2121
> [!NOTE]
2222
> Dataset paths must be in the form `owner/dataset` where `owner` is the username and `dataset` is the dataset name on
@@ -220,6 +220,6 @@ Gaggle is made up of two main components:
220220
- C-compatible FFI surface
221221

222222
2. **C++ DuckDB Bindings (`gaggle/bindings/`)** that:
223-
- Defines the custom SQL functions (for example: `gaggle_ls`, `gaggle_file_paths`, `gaggle_search`)
223+
- Defines the custom SQL functions (for example: `gaggle_ls`, `gaggle_file_path`, `gaggle_search`)
224224
- Integrates with DuckDB’s extension system and replacement scans (`'kaggle:...'`)
225225
- Marshals values between DuckDB vectors and the Rust FFI

docs/examples/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,7 @@ Each file is self‑contained and can be executed in the DuckDB shell (or via `d
2323
```bash
2424
make examples
2525
```
26+
27+
> [!NOTE]
28+
> Some operations (like search and download) need network access unless `GAGGLE_OFFLINE=1`.
29+
> When offline, these operations fail fast if the data is not already cached locally.

docs/examples/e1_core_functionality.sql

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,22 +26,22 @@ limit 5;
2626

2727
-- Section 4: download a dataset
2828
select '## Download a dataset';
29-
select gaggle_download('owid/covid-latest-data') as download_path;
29+
select gaggle_download('uciml/iris') as download_path;
3030

3131
-- Section 5: list files (JSON)
3232
select '## list files (json)';
3333
select to_json(
3434
list(struct_pack(name := name, size := size, path := path))
3535
) as files_json
36-
from gaggle_ls('owid/covid-latest-data');
36+
from gaggle_ls('uciml/iris');
3737

3838
-- Section 5b: list files (table)
3939
select '## list files (table)';
40-
select * from gaggle_ls('owid/covid-latest-data') limit 5;
40+
select * from gaggle_ls('uciml/iris') limit 5;
4141

4242
-- Section 6: get dataset metadata
4343
select '## get dataset metadata';
44-
select gaggle_info('owid/covid-latest-data') as dataset_metadata;
44+
select gaggle_info('uciml/iris') as dataset_metadata;
4545

4646
-- Section 7: get cache information
4747
select '## Get cache information';

docs/examples/e2_advanced_features.sql

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ load 'build/release/extension/gaggle/gaggle.duckdb_extension';
66
select gaggle_set_credentials('your-username', 'your-api-key') as credentials_set;
77

88
-- Get path to specific file
9-
select gaggle_file_paths('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet') as file_path;
9+
select gaggle_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet') as file_path;
1010

1111
-- Use the file path with DuckDB's read_parquet via prepared statement (no subqueries in args)
1212
prepare rp as select * from read_parquet(?) limit 10;
13-
execute rp(gaggle_file_paths('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet'));
13+
execute rp(gaggle_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet'));
1414

1515
-- Section 2: list and process multiple files
1616
select '## list and process dataset files (json and table)';
@@ -35,7 +35,7 @@ select gaggle_cache_info() as cache_status;
3535

3636
-- Section 4: purge cache if needed
3737
select '## Purge cache (optional)';
38-
-- select gaggle_purge_cache() as cache_purged;
38+
-- select gaggle_clear_cache() as cache_cleared;
3939

4040
-- Section 5: Dataset versioning
4141
select '## Check dataset versions';

docs/examples/e4_json_utils.sql

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
.echo on
2+
3+
-- Example 4: JSON helper utilities
4+
-- Shows how to expand JSON using gaggle_json_each
5+
6+
select '## Load extension';
7+
load 'build/release/extension/gaggle/gaggle.duckdb_extension';
8+
9+
select '## Expand JSON values into newline-delimited JSON rows';
10+
select gaggle_json_each('{"a":1,"b":[true,{"c":"x"}],"d":null}') as rows;
11+
12+
-- Combine with DuckDB JSON functions
13+
with x as (
14+
select gaggle_json_each('{"a":1,"b":[true,{"c":"x"}],"d":null}') as row
15+
)
16+
select
17+
json_type(row) as value_type,
18+
json_extract(row, '$') as raw,
19+
json_extract_string(row, '$.key') as key,
20+
json_extract(row, '$.value') as value
21+
from x;
22+
23+
.echo off
24+

docs/examples/e5_cache_ops.sql

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
.echo on
2+
3+
-- Example 5: Cache operations & housekeeping
4+
-- Demonstrates gaggle_version, gaggle_cache_info, gaggle_clear_cache, gaggle_enforce_cache_limit
5+
6+
select '## Load extension';
7+
load 'build/release/extension/gaggle/gaggle.duckdb_extension';
8+
9+
select '## Extension version';
10+
select gaggle_version() as version;
11+
12+
select '## Cache info (path, size, limit)';
13+
select gaggle_cache_info() as cache_info_json;
14+
15+
select '## Clear cache (optional)';
16+
-- Uncomment to clear the local dataset cache
17+
-- select gaggle_clear_cache() as cache_cleared;
18+
19+
select '## Enforce cache size limit (LRU eviction)';
20+
-- This triggers cleanup based on the configured limit; safe to run repeatedly
21+
select gaggle_enforce_cache_limit() as enforced;
22+
23+
.echo off
24+

gaggle/src/config.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use std::cell::RefCell;
44
use std::env;
55
use std::path::PathBuf;
66

7-
const DEFAULT_CACHE_DIR_NAME: &str = "gaggle_cache";
7+
const DEFAULT_CACHE_DIR_NAME: &str = "gaggle";
88

99
pub static CONFIG: Lazy<GaggleConfig> = Lazy::new(GaggleConfig::from_env);
1010

test/sql/test_gaggle_errors.test

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# group: [gaggle]
2+
3+
# Error and argument validation tests for Gaggle
4+
5+
statement ok
6+
pragma enable_verification
7+
8+
# Load the Gaggle extension
9+
statement ok
10+
load 'build/release/extension/gaggle/gaggle.duckdb_extension'
11+
12+
# Wrong arity for gaggle_download should error
13+
statement error
14+
select gaggle_download('a','b')
15+
----
16+
No function matches the given name
17+
18+
# NULL input returns NULL by default (DuckDB default null-handling)
19+
query I
20+
select gaggle_info(NULL) is null
21+
----
22+
1
23+
24+
# Invalid dataset path format should error
25+
statement error
26+
select gaggle_download('owner/too/many/components')
27+
----
28+
Failed
29+
30+
# Replacement scan with invalid kaggle URL should error (no slash)
31+
statement error
32+
select * from 'kaggle:invalid' limit 1
33+
----
34+
kaggle:invalid

0 commit comments

Comments
 (0)