
Commit c8320d1

Add more Sqllogictest-style tests
1 parent be82e81 commit c8320d1

23 files changed: +368, -171 lines

CMakeLists.txt

Lines changed: 21 additions & 2 deletions

@@ -1,8 +1,10 @@
 cmake_minimum_required(VERSION 3.5)

 set(CORROSION_VERBOSE_OUTPUT ON)
-set(CMAKE_CXX_STANDARD 11)
-set(CMAKE_CXX_STANDARD_REQUIRED 1)
+
+# We need C++17 for std::filesystem on all platforms
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)


 # --- Fallback platform detection (only if DuckDB build system did not set these) ---
@@ -222,3 +224,20 @@ if(TARGET ${EXTENSION_NAME})
 else()
 message(STATUS "[gaggle] No C++ extension target built; skipping install directive.")
 endif()
+
+# Link filesystem library on older libstdc++ (GCC < 9 requires -lstdc++fs)
+set(_GAGGLE_FS_LIB "")
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0)
+set(_GAGGLE_FS_LIB stdc++fs)
+endif()
+endif()
+
+if(_GAGGLE_FS_LIB)
+if (TARGET ${EXTENSION_NAME})
+target_link_libraries(${EXTENSION_NAME} ${_GAGGLE_FS_LIB})
+endif()
+if (TARGET ${LOADABLE_EXTENSION_NAME})
+target_link_libraries(${LOADABLE_EXTENSION_NAME} ${_GAGGLE_FS_LIB})
+endif()
+endif()

README.md

Lines changed: 9 additions & 22 deletions

@@ -12,34 +12,34 @@
 [![Docs](https://img.shields.io/badge/docs-read-blue?style=flat&labelColor=282c34&logo=read-the-docs)](https://github.com/CogitatorTech/gaggle/tree/main/docs)
 [![License](https://img.shields.io/badge/license-MIT%2FApache--2.0-007ec6?style=flat&labelColor=282c34&logo=open-source-initiative)](https://github.com/CogitatorTech/gaggle)

-Kaggle Datasets for DuckDB
+Access and Query Kaggle Datasets from DuckDB

 </div>

 ---

 Gaggle is a DuckDB extension that allows you to work with Kaggle datasets directly in SQL queries, as if
 they were DuckDB tables.
-It is written in Rust and uses the Kaggle API to search, download, and manage datasets.
+It is written in Rust and uses the Kaggle API to search, download, and manage the datasets.

 Kaggle hosts a large collection of very useful datasets for data science and machine learning.
 Accessing these datasets typically involves manually downloading a dataset (as a ZIP file),
 extracting it, loading the files in the dataset into your data science environment, and managing storage and dataset
 updates, etc.
-This workflow can be become complex, especially when working with multiple datasets or when datasets are updated
+This workflow can quickly become complex, especially when working with multiple datasets or when datasets are updated
 frequently.
 Gaggle tries to help simplify this process by hiding the complexity and letting you work with datasets directly inside
 an analytical database like DuckDB that can handle fast queries.
 In essence, Gaggle makes DuckDB into a SQL-enabled frontend for Kaggle datasets.

 ### Features

-- Has a simple API to interact with Kaggle datasets from DuckDB
+- Provides a simple API to interact with Kaggle datasets from DuckDB
 - Allows you to search, download, and read datasets from Kaggle
-- Supports datasets that contain CSV, Parquet, JSON, and XLSX files (XLSX requires DuckDB's Excel reader to be available in your DuckDB build)
-- Configurable and has built-in caching support
+- Supports datasets that contain CSV, Parquet, JSON, and XLSX files
+- Configurable and has built-in caching of downloaded datasets
 - Thread-safe, fast, and has a low memory footprint
-- Supports dataset versioning and update checks
+- Supports dataset updates and versioning

 See the [ROADMAP.md](ROADMAP.md) for planned features and the [docs](docs) folder for detailed documentation.

@@ -101,8 +101,8 @@ select gaggle_set_credentials('your-username', 'your-api-key');
 -- Get extension version
 select gaggle_version();

--- List files in the downloaded dataset
--- (Note that if the datasets is not downloaded yet, it will be downloaded and cached first)
+-- List files in the dataset
+-- (Note that if the datasets is not downloaded, it will be downloaded and cached automatically)
 select *
 from gaggle_ls('habedi/flickr-8k-dataset-clean') limit 5;

@@ -150,19 +150,6 @@ Check out the [examples](docs/examples) directory for SQL scripts that show how

 ---

-### Configuration
-
-See [CONFIGURATION.md](docs/CONFIGURATION.md) for full details. Main environment variables:
-
-- `GAGGLE_CACHE_DIR` — cache directory path (default: `~/.cache/gaggle`)
-- `GAGGLE_HTTP_TIMEOUT` — HTTP timeout (in seconds)
-- `GAGGLE_HTTP_RETRY_ATTEMPTS` — retry attempts after the initial try
-- `GAGGLE_HTTP_RETRY_DELAY_MS` — initial backoff delay (in milliseconds)
-- `GAGGLE_HTTP_RETRY_MAX_DELAY_MS` — maximum backoff delay cap (in milliseconds)
-- `GAGGLE_LOG_LEVEL` — structured log level for the Rust core (like `INFO` or `DEBUG`)
-- `GAGGLE_OFFLINE` — disable network; only use cached data (downloads fail fast if not cached)
-- `KAGGLE_USERNAME`, `KAGGLE_KEY` — Kaggle credentials (alternative to the SQL call)
-
 ### Contributing

 See [CONTRIBUTING.md](CONTRIBUTING.md) for details on how to make a contribution.
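
For context, here is a minimal SQL sketch (not part of this commit) of the quick-start flow the updated README describes: listing a dataset's files and then reading one of them through the `kaggle:` replacement scan. The file name `some_file.csv` and the exact `kaggle:owner/dataset/file` URL shape are illustrative assumptions.

```sql
-- Illustrative sketch only; 'some_file.csv' and the exact kaggle: URL shape are assumptions.
-- List files in the dataset (downloads and caches it on first use).
select * from gaggle_ls('habedi/flickr-8k-dataset-clean') limit 5;

-- Read one of the listed files directly via the replacement scan.
select count(*) from 'kaggle:habedi/flickr-8k-dataset-clean/some_file.csv';
```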

ROADMAP.md

Lines changed: 2 additions & 2 deletions

@@ -39,7 +39,7 @@ It outlines features to be implemented and their current status.
 * [x] CSV and TSV file reading.
 * [x] Parquet file reading.
 * [x] JSON file reading.
-* [ ] Excel (XLSX) file reading. (Available when DuckDB is built with the Excel reader; replacement scan routes `.xlsx` to `read_excel`.)
+* [x] Excel (XLSX) file reading.
 * **Querying Datasets**
 * [x] Replacement scan for `kaggle:` URLs.
 * [ ] Virtual table support for lazy loading.
@@ -66,8 +66,8 @@ It outlines features to be implemented and their current status.
 * [x] Detailed error codes for programmatic error handling.
 * **Resilience**
 * [x] Automatic retry on network failures.
-* [ ] Graceful degradation when Kaggle API is unavailable.
 * [x] Local-only mode for cached datasets (via `GAGGLE_OFFLINE`).
+* [ ] Graceful degradation when Kaggle API is unavailable.

 ### 6. Documentation and Distribution
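
For context, a hedged sketch (not part of this commit) of what the now-checked XLSX support looks like in use; the dataset and file names are hypothetical, and reading `.xlsx` this way assumes DuckDB's Excel reader is available, as the removed roadmap note cautioned.

```sql
-- Hypothetical owner/dataset and file names; requires DuckDB's Excel reader.
-- The replacement scan routes .xlsx files to read_excel under the hood.
select *
from 'kaggle:owner/spreadsheet-dataset/report.xlsx'
limit 10;
```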

docs/CONFIGURATION.md

Lines changed: 48 additions & 44 deletions

@@ -10,7 +10,7 @@ Gaggle supports configuration via environment variables to customize its behavior

 - **Description**: Directory path for caching downloaded Kaggle datasets
 - **Type**: String (path)
-- **Default**: `$XDG_CACHE_HOME/gaggle_cache` (typically `~/.cache/gaggle_cache`)
+- **Default**: `$XDG_CACHE_HOME/gaggle` (typically `~/.cache/gaggle`)
 - **Example**:
 ```bash
 export GAGGLE_CACHE_DIR="/var/cache/gaggle"
@@ -77,38 +77,34 @@ Gaggle supports configuration via environment variables to customize its behavior
 - **Description**: Number of retry attempts after the initial try
 - **Type**: Integer
 - **Default**: `3`
-- **GAGGLE_HTTP_RETRY_DELAY_MS**
-- **Description**: Initial backoff delay in milliseconds
-- **Type**: Integer (ms)
-- **Default**: `1000`
-- **GAGGLE_HTTP_RETRY_MAX_DELAY_MS**
-- **Description**: Maximum backoff delay cap in milliseconds
-- **Type**: Integer (ms)
-- **Default**: `30000`
+- **GAGGLE_HTTP_RETRY_DELAY**
+- **Description**: Initial backoff delay in seconds
+- **Type**: Float or integer (seconds)
+- **Default**: `1`
+- **GAGGLE_HTTP_RETRY_MAX_DELAY**
+- **Description**: Maximum backoff delay cap in seconds
+- **Type**: Float or integer (seconds)
+- **Default**: `30`

-These controls enable exponential backoff with cap across metadata/search/download requests.
+These controls enable exponential backoff with cap across metadata/search/download requests.

 #### Download Coordination

 When multiple queries attempt to download the same dataset concurrently, Gaggle coordinates using an in-process lock.
 These settings control the wait behavior when a download is already in progress.

-- **GAGGLE_DOWNLOAD_WAIT_TIMEOUT_MS**
-- **Description**: Maximum time a waiting request will block for a concurrent download to finish
-- **Type**: Integer (milliseconds)
-- **Default**: `30000` (30 seconds)
+- **GAGGLE_DOWNLOAD_WAIT_TIMEOUT**
+- **Description**: Maximum time a waiting request will block (seconds)
+- **Type**: Float or integer (seconds)
+- **Default**: `30`
 - **Example**:
 ```bash
-export GAGGLE_DOWNLOAD_WAIT_TIMEOUT_MS=600000 # 10 minutes
-```
-- **GAGGLE_DOWNLOAD_WAIT_POLL_MS**
-- **Description**: Polling interval while waiting on another download
-- **Type**: Integer (milliseconds)
-- **Default**: `100`
-- **Example**:
-```bash
-export GAGGLE_DOWNLOAD_WAIT_POLL_MS=250
+export GAGGLE_DOWNLOAD_WAIT_TIMEOUT=600 # 10 minutes
 ```
+- **GAGGLE_DOWNLOAD_WAIT_POLL**
+- **Description**: Polling interval while waiting (seconds)
+- **Type**: Float or integer (seconds)
+- **Default**: `0.1`

 #### Logging Configuration

@@ -144,9 +140,10 @@ These settings control the wait behavior when a download is already in progress.
 - **Type**: Boolean (`1`, `true`, `yes`, `on` to enable)
 - **Default**: `false`
 - **Effects**:
-- gaggle_download(...) fails if the dataset isn’t cached.
-- Version checks use cached `.downloaded` metadata when available; otherwise return "unknown".
-- Search and metadata calls will still attempt network; consider avoiding them in offline mode.
+- `gaggle_download(...)` fails if the dataset isn’t cached.
+- `gaggle_version_info` reports `latest_version` as "unknown" if no cache metadata exists.
+- `gaggle_is_current` and other version checks use cached `.downloaded` metadata when available.
+- `gaggle_search` and `gaggle_info` also fail fast in offline mode (no network attempts).
 - **Example**:
 ```bash
 export GAGGLE_OFFLINE=1
@@ -185,9 +182,9 @@ export GAGGLE_CACHE_DIR="/var/lib/gaggle/cache"
 export GAGGLE_CACHE_SIZE_LIMIT_MB=51200 # 50GB
 export GAGGLE_HTTP_TIMEOUT=120 # 2 minutes
 export GAGGLE_HTTP_RETRY_ATTEMPTS=5 # Retry up to 5 times
-export GAGGLE_HTTP_RETRY_DELAY_MS=2000 # 2 second initial delay
-export GAGGLE_HTTP_RETRY_MAX_DELAY_MS=30000 # Cap backoff at 30s
-export GAGGLE_LOG_LEVEL=WARN # Production logging (planned)
+export GAGGLE_HTTP_RETRY_DELAY=2 # 2 second initial delay
+export GAGGLE_HTTP_RETRY_MAX_DELAY=30 # Cap backoff at 30s
+export GAGGLE_LOG_LEVEL=WARN # Production logging

 ## Set Kaggle credentials
 export KAGGLE_USERNAME="your-username"
@@ -202,10 +199,10 @@ export KAGGLE_KEY="your-api-key"
 ```bash
 ## Development setup with verbose logging
 export GAGGLE_CACHE_DIR="./dev-cache"
-export GAGGLE_LOG_LEVEL=DEBUG ## Detailed debug logs (planned)
+export GAGGLE_LOG_LEVEL=DEBUG ## Detailed debug logs
 export GAGGLE_HTTP_TIMEOUT=10 ## Shorter timeout for dev
 export GAGGLE_HTTP_RETRY_ATTEMPTS=1 ## Fail fast in development
-export GAGGLE_HTTP_RETRY_DELAY_MS=250 ## Quick retry
+export GAGGLE_HTTP_RETRY_DELAY=0.25 ## Quick retry (250ms)

 ## Run DuckDB
 ./build/release/duckdb
@@ -217,8 +214,8 @@ export GAGGLE_HTTP_RETRY_DELAY_MS=250 ## Quick retry
 ## Configuration for slow or unreliable networks
 export GAGGLE_HTTP_TIMEOUT=300 ## 5 minute timeout
 export GAGGLE_HTTP_RETRY_ATTEMPTS=10 ## Many retries
-export GAGGLE_HTTP_RETRY_DELAY_MS=5000 ## 5 second initial delay
-export GAGGLE_HTTP_RETRY_MAX_DELAY_MS=60000 ## Cap at 60s
+export GAGGLE_HTTP_RETRY_DELAY=5 ## 5 second initial delay
+export GAGGLE_HTTP_RETRY_MAX_DELAY=60 ## Cap at 60s

 ./build/release/duckdb
 ```
@@ -230,10 +227,11 @@ export GAGGLE_HTTP_RETRY_MAX_DELAY_MS=60000 ## Cap at 60s
 export GAGGLE_OFFLINE=1

 # Attempt to download a dataset (will fail if not cached)
-gaggle download username/dataset-name
+SELECT gaggle_download('username/dataset-name');

-# Querying metadata or searching will still attempt network access
-gaggle info username/dataset-name
+# Querying metadata or searching will fail fast in offline mode
+SELECT gaggle_info('username/dataset-name');
+SELECT gaggle_search('keyword', 1, 10);
 ```

 ### Configuration Verification
@@ -253,21 +251,27 @@ SELECT gaggle_search('housing', 1, 10);

 -- Get dataset metadata
 SELECT gaggle_info('username/dataset-name');
+
+-- Retrieve last error string (or NULL if none)
+SELECT gaggle_last_error();
 ```

 ### Retry Policy Details

 Gaggle implements retries with exponential backoff for HTTP requests. The number of attempts, initial delay, and
 maximum delay can be tuned with the environment variables above.

-### Logging Levels (planned)
+### Logging Levels

-Detailed logging control via `GAGGLE_LOG_LEVEL` is planned but not yet implemented.
+Detailed logging control via `GAGGLE_LOG_LEVEL` is implemented.

-### Notes
+### Units

-- Cache directory and HTTP timeout are checked at runtime. Changing `GAGGLE_CACHE_DIR` or `GAGGLE_HTTP_TIMEOUT` takes
-effect for subsequent operations in the same process.
-- Kaggle credentials can be provided via environment variables, config file, or the `gaggle_set_credentials()` SQL
-function.
-- Invalid values fall back to sensible defaults.
+- Storage sizes are reported in megabytes (MB) throughout the API and SQL functions.
+- Timeouts and retry delays are configured in seconds via environment variables with clean names (no unit suffixes). For example: `GAGGLE_HTTP_RETRY_DELAY=1.5`.
+
+```sql
+-- Example cache info (note size is in MB only)
+SELECT gaggle_cache_info();
+-- {"path":"...","size_mb":42,"limit_mb":102400,"usage_percent":0,"is_soft_limit":true,"type":"local"}
+```
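
As a side note (not part of the diff), a small SQL sketch of how the offline-mode behavior documented above could be checked from a DuckDB session; it assumes `GAGGLE_OFFLINE=1` was exported before starting DuckDB, that the extension is loaded, and that `username/dataset-name` is a placeholder.

```sql
-- Assumes GAGGLE_OFFLINE=1 is set in the environment before DuckDB starts.
-- With no cached copy, the download fails fast instead of touching the network.
SELECT gaggle_download('username/dataset-name');

-- Inspect the last error string to see why it failed (NULL if there was no error).
SELECT gaggle_last_error();

-- Version info falls back to cached .downloaded metadata; latest_version is "unknown" without it.
SELECT gaggle_version_info('username/dataset-name');
```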

docs/ERROR_CODES.md

Lines changed: 2 additions & 1 deletion

@@ -159,7 +159,8 @@ Network error occurred during communication with Kaggle API.
 4. **Retry with backoff:**
 ```bash
 export GAGGLE_HTTP_RETRY_ATTEMPTS=5
-export GAGGLE_HTTP_RETRY_DELAY_MS=2000
+export GAGGLE_HTTP_RETRY_DELAY=2
+export GAGGLE_HTTP_RETRY_MAX_DELAY=30
 ```

 5. **Check firewall settings:**

docs/README.md

Lines changed: 10 additions & 5 deletions

@@ -16,7 +16,7 @@ The table below includes the information about all SQL functions exposed by Gaggle
 | 10 | `gaggle_update_dataset(dataset_path VARCHAR)` | `VARCHAR` | Forces update to latest version (ignores cache). Returns local path to freshly downloaded dataset. |
 | 11 | `gaggle_version_info(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Returns version info: `cached_version`, `latest_version`, `is_current`, `is_cached`. |
 | 12 | `gaggle_json_each(json VARCHAR)` | `VARCHAR` | Expands a JSON object/array into newline-delimited JSON rows with fields: `key`, `value`, `type`, `path`. |
-| 13 | `gaggle_file_paths(dataset_path VARCHAR, filename VARCHAR)` | `VARCHAR` | Resolves a specific file's local path inside a downloaded dataset. |
+| 13 | `gaggle_file_path(dataset_path VARCHAR, filename VARCHAR)` | `VARCHAR` | Resolves a specific file's local path inside a downloaded dataset. |

 > [!NOTE]
 > Dataset paths must be in the form `owner/dataset` where `owner` is the username and `dataset` is the dataset name on
@@ -188,15 +188,20 @@ See [CONFIGURATION.md](CONFIGURATION.md) for full details. Key environment variables:
 - `GAGGLE_CACHE_DIR` — cache directory path (default: `~/.cache/gaggle`)
 - `GAGGLE_HTTP_TIMEOUT` — HTTP timeout (in seconds)
 - `GAGGLE_HTTP_RETRY_ATTEMPTS` — retry attempts after the initial try
-- `GAGGLE_HTTP_RETRY_DELAY_MS` — initial backoff delay (in milliseconds)
-- `GAGGLE_HTTP_RETRY_MAX_DELAY_MS` — maximum backoff delay cap (in milliseconds)
+- `GAGGLE_HTTP_RETRY_DELAY` — initial backoff delay (in seconds)
+- `GAGGLE_HTTP_RETRY_MAX_DELAY` — maximum backoff delay cap (in seconds)
 - `GAGGLE_LOG_LEVEL` — structured log level for the Rust core (e.g., `INFO`, `DEBUG`)
 - `GAGGLE_OFFLINE` — disable network; only use cached data (downloads fail fast if not cached)
-- `KAGGLE_USERNAME`, `KAGGLE_KEY` — Kaggle credentials (alternative to the SQL call)
+- `KAGGLE_USERNAME` and `KAGGLE_KEY` — Kaggle credentials (alternative to the SQL call)

 > [!NOTE]
 > Environment variables are case-sensitive on Unix-like systems. Changes take effect for subsequent operations in the same process.

+#### Units
+
+- Storage sizes are reported in megabytes (MB) across SQL/API (for example: `gaggle_cache_info()` returns `size_mb`).
+- Timeouts and retry delays are configured in seconds (via clean environment variables without unit suffixes).
+
 ### Replacement Scan Readers

 Gaggle selects the DuckDB reader based on file extension:
@@ -220,6 +225,6 @@ Gaggle is made up of two main components:
 - C-compatible FFI surface

 2. **C++ DuckDB Bindings (`gaggle/bindings/`)** that:
-- Defines the custom SQL functions (for example: `gaggle_ls`, `gaggle_file_paths`, `gaggle_search`)
+- Defines the custom SQL functions (for example: `gaggle_ls`, `gaggle_file_path`, `gaggle_search`)
 - Integrates with DuckDB’s extension system and replacement scans (`'kaggle:...'`)
 - Marshals values between DuckDB vectors and the Rust FFI
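
For illustration (not part of the commit), a sketch of using the renamed `gaggle_file_path` function; `owner/dataset` and `data.csv` are hypothetical placeholders, and the returned path can then be handed to a regular DuckDB reader such as `read_csv`.

```sql
-- Resolve the local path of a specific file inside a downloaded dataset.
-- 'owner/dataset' and 'data.csv' are placeholders for illustration only.
SELECT gaggle_file_path('owner/dataset', 'data.csv') AS local_path;
```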

docs/examples/README.md

Lines changed: 4 additions & 0 deletions

@@ -23,3 +23,7 @@ Each file is self‑contained and can be executed in the DuckDB shell (or via `d
 ```bash
 make examples
 ```
+
+> [!NOTE]
+> Some operations (like search and download) need network access unless `GAGGLE_OFFLINE=1`.
+> When offline, these will fail fast if data is not cached locally (not downloaded already).

docs/examples/e1_core_functionality.sql

Lines changed: 4 additions & 4 deletions

@@ -26,22 +26,22 @@ limit 5;

 -- Section 4: download a dataset
 select '## Download a dataset';
-select gaggle_download('owid/covid-latest-data') as download_path;
+select gaggle_download('uciml/iris') as download_path;

 -- Section 5: list files (JSON)
 select '## list files (json)';
 select to_json(
 list(struct_pack(name := name, size := size, path := path))
 ) as files_json
-from gaggle_ls('owid/covid-latest-data');
+from gaggle_ls('uciml/iris');

 -- Section 5b: list files (table)
 select '## list files (table)';
-select * from gaggle_ls('owid/covid-latest-data') limit 5;
+select * from gaggle_ls('uciml/iris') limit 5;

 -- Section 6: get dataset metadata
 select '## get dataset metadata';
-select gaggle_info('owid/covid-latest-data') as dataset_metadata;
+select gaggle_info('uciml/iris') as dataset_metadata;

 -- Section 7: get cache information
 select '## Get cache information';
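
A possible follow-up section for this example script (not part of the commit) that queries the downloaded data itself; the `Iris.csv` file name and the `kaggle:` URL form are assumptions, so check the `gaggle_ls` output for the real file names.

```sql
-- Section 8 (illustrative only): query a file from the downloaded dataset.
-- 'Iris.csv' is an assumed file name; verify it against gaggle_ls('uciml/iris').
select '## Query a file from the dataset';
select count(*) as row_count
from 'kaggle:uciml/iris/Iris.csv';
```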
