Skip to content

Commit 0126a4f

Browse files
committed
WIP
1 parent e590e4e commit 0126a4f

File tree

18 files changed

+742
-100
lines changed

18 files changed

+742
-100
lines changed

README.md

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,17 +31,14 @@ frequently.
3131
Gaggle tries to help simplify this process by hiding the complexity and letting you work with datasets directly inside
3232
an analytical database like DuckDB that can handle fast queries.
3333

34-
> [!NOTE]
35-
> Gaggle is similar of [Hugging Face extension]() for DuckDB.
36-
> Although, Kaggle datasets are not directly accessible from a remote file system, like Hugging Face datasets.
37-
3834
### Features
3935

40-
- Has a simple API (just a handful of SQL functions)
41-
- Allows you search, download, update, and delete Kaggle datasets directly from DuckDB
42-
- Supports datasets made of CSV, JSON, and Parquet files
36+
- Has a simple API to interact with Kaggle datasets from DuckDB
37+
- Allows you to search, download, and read datasets from Kaggle
38+
- Supports datasets that contain CSV, Parquet, JSON, and XLSX files
4339
- Configurable and has built-in caching support
44-
- Thread-safe and memory-efficient
40+
- Thread-safe, fast, and has a low memory footprint
41+
- Supports dataset versioning and update checks
4542

4643
See the [ROADMAP.md](ROADMAP.md) for planned features and the [docs](docs) folder for detailed documentation.
4744

@@ -94,35 +91,34 @@ make release
9491
#### Trying Gaggle
9592

9693
```sql
97-
-- Load the Gaggle extension
98-
load 'build/release/extension/gaggle/gaggle.duckdb_extension';
94+
-- Load the Gaggle extension (only needed if you built from source)
95+
--load 'build/release/extension/gaggle/gaggle.duckdb_extension';
9996

100-
-- Set your Kaggle credentials (or use `~/.kaggle/kaggle.json`)
97+
-- Set your Kaggle credentials manually (or use `~/.kaggle/kaggle.json`)
10198
select gaggle_set_credentials('your-username', 'your-api-key');
10299

103100
-- Get extension version
104101
select gaggle_version();
105102

106-
-- Prime cache by downloading the dataset locally (optional, but improves first-time performance)
107-
select gaggle_download('habedi/flickr-8k-dataset-clean');
108-
109103
-- List files in the downloaded dataset
104+
-- (Note that if the dataset is not downloaded yet, it will be downloaded and cached first)
110105
select *
111106
from gaggle_ls('habedi/flickr-8k-dataset-clean') limit 5;
112107

113-
-- Read a Parquet file from local cache using a prepared statement (no subquery in function args)
108+
-- Read a Parquet file from local cache using a prepared statement
109+
-- (Note that DuckDB doesn't support subqueries in function arguments, so we use a prepared statement)
114110
prepare rp as select * from read_parquet(?) limit 10;
115111
execute rp(gaggle_file_paths('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet'));
116112

117-
-- Use the replacement scan to read directly via kaggle: URL
113+
-- Alternatively, we can use a replacement scan to read directly via the `kaggle:` prefix
118114
select count(*)
119115
from 'kaggle:habedi/flickr-8k-dataset-clean/flickr8k.parquet';
120116

121117
-- Or glob Parquet files in a dataset directory
122118
select count(*)
123119
from 'kaggle:habedi/flickr-8k-dataset-clean/*.parquet';
124120

125-
-- Optionally, check cache info
121+
-- Optionally, check cache info
126122
select gaggle_cache_info();
127123

128124
-- Enforce cache size limit manually (automatic with soft limit by default)

ROADMAP.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ It outlines features to be implemented and their current status.
3939
* [x] CSV and TSV file reading.
4040
* [x] Parquet file reading.
4141
* [x] JSON file reading.
42-
* [ ] Excel and XLSX file reading.
42+
* [ ] Excel (XLSX) file reading.
4343
* **Querying Datasets**
4444
* [x] Replacement scan for `kaggle:` URLs.
4545
* [ ] Virtual table support for lazy loading.
@@ -49,7 +49,7 @@ It outlines features to be implemented and their current status.
4949
* **Concurrency Control**
5050
* [x] Thread-safe credential storage.
5151
* [x] Thread-safe cache access.
52-
* [x] Concurrent dataset downloads (with per-dataset serialization to prevent race conditions).
52+
* [x] Concurrent dataset downloads.
5353
* **Network Optimization**
5454
* [x] Configurable HTTP timeouts.
5555
* [x] Retry logic with backoff for failed requests.
@@ -80,8 +80,13 @@ It outlines features to be implemented and their current status.
8080
* **Testing**
8181
* [x] Unit tests for core modules (Rust).
8282
* [x] SQL integration tests (DuckDB shell).
83-
* [ ] End-to-end integration tests with mocked HTTP.
83+
* [x] End-to-end integration tests with mocked HTTP (basic coverage).
8484
* [ ] Performance benchmarks.
8585
* **Distribution**
8686
* [ ] Pre-compiled extension binaries for Linux, macOS, and Windows.
8787
* [ ] Submission to the DuckDB Community Extensions repository.
88+
89+
### 7. Observability
90+
91+
* **Logging**
92+
* [x] Structured logging via `tracing` with `GAGGLE_LOG_LEVEL`.

docs/CONFIGURATION.md

Lines changed: 58 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,28 @@ Gaggle supports configuration via environment variables to customize its behavio
8888

8989
These controls enable exponential backoff with a cap across metadata, search, and download requests.
9090

91+
#### Download Coordination
92+
93+
When multiple queries attempt to download the same dataset concurrently, Gaggle coordinates using an in-process lock.
94+
These settings control the wait behavior when a download is already in progress.
95+
96+
- **GAGGLE_DOWNLOAD_WAIT_TIMEOUT_MS**
97+
- **Description**: Maximum time a waiting request will block for a concurrent download to finish
98+
- **Type**: Integer (milliseconds)
99+
- **Default**: `30000` (30 seconds)
100+
- **Example**:
101+
```bash
102+
export GAGGLE_DOWNLOAD_WAIT_TIMEOUT_MS=600000 # 10 minutes
103+
```
104+
- **GAGGLE_DOWNLOAD_WAIT_POLL_MS**
105+
- **Description**: Polling interval while waiting on another download
106+
- **Type**: Integer (milliseconds)
107+
- **Default**: `100`
108+
- **Example**:
109+
```bash
110+
export GAGGLE_DOWNLOAD_WAIT_POLL_MS=250
111+
```
112+
91113
#### Logging Configuration
92114

93115
##### GAGGLE_VERBOSE
@@ -100,24 +122,36 @@ Gaggle supports configuration via environment variables to customize its behavio
100122
export GAGGLE_VERBOSE=1
101123
```
102124

103-
##### GAGGLE_LOG_LEVEL (planned)
125+
##### GAGGLE_LOG_LEVEL
104126

105-
- **Description**: Set logging level for detailed output
106-
- **Type**: String (`ERROR`, `WARN`, `INFO`, `DEBUG`)
127+
- **Description**: Set logging level for structured logs emitted by the Rust core (via `tracing`)
128+
- **Type**: String (`ERROR`, `WARN`, `INFO`, `DEBUG`, `TRACE`); case-insensitive
107129
- **Default**: `WARN`
108-
- **Status**: Planned, not implemented yet
130+
- **Status**: ✅ Implemented
109131
- **Example**:
110132
```bash
111-
## Show all messages including debug
112-
export GAGGLE_LOG_LEVEL=DEBUG
113-
114-
## Show only errors
115-
export GAGGLE_LOG_LEVEL=ERROR
116-
117-
## Show informational messages and above
118133
export GAGGLE_LOG_LEVEL=INFO
119134
```
120135

136+
Notes:
137+
- Logging is initialized lazily on first use (when the crate is loaded in-process or when `gaggle::init_logging()` is called). The environment variable is read once per process.
138+
- Logs include a level prefix and optional ANSI colors if stderr is a terminal.
139+
140+
#### Offline Mode
141+
142+
- **GAGGLE_OFFLINE**
143+
- **Description**: Disable network access. When enabled, operations that require network will fail fast unless data is already cached.
144+
- **Type**: Boolean (`1`, `true`, `yes`, `on` to enable)
145+
- **Default**: `false`
146+
- **Effects**:
147+
- gaggle_download(...) fails if the dataset isn’t cached.
148+
- Version checks use cached `.downloaded` metadata when available; otherwise return "unknown".
149+
- Search and metadata calls will still attempt network access; consider avoiding them in offline mode.
150+
- **Example**:
151+
```bash
152+
export GAGGLE_OFFLINE=1
153+
```
154+
121155
### Usage Examples
122156
123157
#### Example 1: Custom Cache Directory
@@ -189,6 +223,19 @@ export GAGGLE_HTTP_RETRY_MAX_DELAY_MS=60000 ## Cap at 60s
189223
./build/release/duckdb
190224
```
191225
226+
#### Example 6: Offline Mode
227+
228+
```bash
229+
# Enable offline mode
230+
export GAGGLE_OFFLINE=1
231+
232+
# Attempt to download a dataset (will fail if not cached)
233+
gaggle download username/dataset-name
234+
235+
# Querying metadata or searching will still attempt network access
236+
gaggle info username/dataset-name
237+
```
238+
192239
### Configuration Verification
193240
194241
You can verify your configuration at runtime:

docs/README.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,13 @@ Table function:
3333

3434
Replacement scan (transparent table read):
3535

36-
- Single file: `'kaggle:owner/dataset/file.parquet'`
37-
- Glob: `'kaggle:owner/dataset/*.parquet'`
38-
- Parquet paths use DuckDB’s `read_parquet`; other files default to `read_csv_auto`.
36+
- Single file: `'kaggle:owner/dataset/file.ext'`
37+
- Glob: `'kaggle:owner/dataset/*.ext'`
38+
- Reader is chosen by extension:
39+
- `.parquet`/`.parq` -> `read_parquet`
40+
- `.json`/`.jsonl`/`.ndjson` -> `read_json_auto`
41+
- `.xlsx` -> `read_excel`
42+
- everything else -> `read_csv_auto`
3943

4044
---
4145

gaggle/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ reqwest = { version = "0.12", features = ["blocking", "rustls-tls", "json", "mul
2222
zip = "6.0"
2323
dirs = "6.0"
2424
urlencoding = "2.1"
25+
tracing = "0.1"
26+
tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
27+
atty = "0.2"
2528

2629
[dev-dependencies]
2730
tempfile = "3.10"

gaggle/bindings/gaggle_extension.cpp

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,23 @@ KaggleReplacementScan(ClientContext &context, ReplacementScanInput &input,
511511
pattern.find('*') != string::npos || pattern.find('?') != string::npos;
512512
bool is_dir = pattern.empty();
513513

514+
auto decide_reader = [](const string &lower_ext) -> string {
515+
if (StringUtil::EndsWith(lower_ext, ".parquet") ||
516+
StringUtil::EndsWith(lower_ext, ".parq")) {
517+
return "read_parquet";
518+
}
519+
if (StringUtil::EndsWith(lower_ext, ".json") ||
520+
StringUtil::EndsWith(lower_ext, ".jsonl") ||
521+
StringUtil::EndsWith(lower_ext, ".ndjson")) {
522+
return "read_json_auto";
523+
}
524+
if (StringUtil::EndsWith(lower_ext, ".xlsx")) {
525+
return "read_excel";
526+
}
527+
// Default CSV/TSV and others to DuckDB's auto CSV reader
528+
return "read_csv_auto";
529+
};
530+
514531
if (is_dir || has_wildcard) {
515532
// Ensure dataset is downloaded and construct a glob path
516533
char *dir_c = gaggle_download_dataset(dataset_path.c_str());
@@ -525,10 +542,7 @@ KaggleReplacementScan(ClientContext &context, ReplacementScanInput &input,
525542
local_path = dir_path + tail;
526543

527544
// Choose reader based on pattern extension if any
528-
if (StringUtil::EndsWith(lower_pat, ".parquet") ||
529-
StringUtil::EndsWith(lower_pat, ".parq")) {
530-
func_name = "read_parquet";
531-
}
545+
func_name = decide_reader(lower_pat);
532546
} else {
533547
// Specific file: resolve exact path
534548
char *file_path_c =
@@ -551,10 +565,7 @@ KaggleReplacementScan(ClientContext &context, ReplacementScanInput &input,
551565

552566
// Decide reader based on extension
553567
auto lower_name = StringUtil::Lower(pattern);
554-
if (StringUtil::EndsWith(lower_name, ".parquet") ||
555-
StringUtil::EndsWith(lower_name, ".parq")) {
556-
func_name = "read_parquet";
557-
}
568+
func_name = decide_reader(lower_name);
558569
}
559570

560571
// Construct a table function call: func_name(local_path)

gaggle/src/config.rs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,12 @@ pub struct GaggleConfig {
1919
/// HTTP timeout in seconds
2020
#[allow(dead_code)]
2121
pub http_timeout_secs: u64,
22+
/// Download lock wait timeout in milliseconds
23+
#[allow(dead_code)]
24+
pub download_wait_timeout_ms: u64,
25+
/// Download lock poll interval in milliseconds
26+
#[allow(dead_code)]
27+
pub download_wait_poll_ms: u64,
2228
// Future: other options
2329
}
2430

@@ -29,6 +35,8 @@ impl GaggleConfig {
2935
cache_dir: Self::get_cache_dir(),
3036
verbose_logging: Self::get_verbose(),
3137
http_timeout_secs: Self::get_http_timeout(),
38+
download_wait_timeout_ms: Self::get_download_wait_timeout_ms(),
39+
download_wait_poll_ms: Self::get_download_wait_poll_ms(),
3240
}
3341
}
3442

@@ -65,6 +73,22 @@ impl GaggleConfig {
6573
.and_then(|v| v.parse().ok())
6674
.unwrap_or(30)
6775
}
76+
77+
/// Get download wait timeout from env (default 30_000 ms)
78+
fn get_download_wait_timeout_ms() -> u64 {
79+
env::var("GAGGLE_DOWNLOAD_WAIT_TIMEOUT_MS")
80+
.ok()
81+
.and_then(|v| v.parse().ok())
82+
.unwrap_or(30_000)
83+
}
84+
85+
/// Get download wait poll interval from env (default 100 ms)
86+
fn get_download_wait_poll_ms() -> u64 {
87+
env::var("GAGGLE_DOWNLOAD_WAIT_POLL_MS")
88+
.ok()
89+
.and_then(|v| v.parse().ok())
90+
.unwrap_or(100)
91+
}
6892
}
6993

7094
impl Default for GaggleConfig {
@@ -154,6 +178,33 @@ pub fn cache_limit_is_soft() -> bool {
154178
.unwrap_or(true)
155179
}
156180

181+
/// Runtime-resolved download wait timeout in milliseconds
182+
pub fn download_wait_timeout_ms() -> u64 {
183+
env::var("GAGGLE_DOWNLOAD_WAIT_TIMEOUT_MS")
184+
.ok()
185+
.and_then(|v| v.parse().ok())
186+
.unwrap_or(CONFIG.download_wait_timeout_ms)
187+
}
188+
189+
/// Runtime-resolved download wait poll interval in milliseconds
190+
pub fn download_wait_poll_interval_ms() -> u64 {
191+
env::var("GAGGLE_DOWNLOAD_WAIT_POLL_MS")
192+
.ok()
193+
.and_then(|v| v.parse().ok())
194+
.unwrap_or(CONFIG.download_wait_poll_ms)
195+
}
196+
197+
/// Whether offline mode is enabled (disables network operations). Controlled by GAGGLE_OFFLINE
198+
pub fn offline_mode() -> bool {
199+
std::env::var("GAGGLE_OFFLINE")
200+
.ok()
201+
.map(|v| match v.to_lowercase().as_str() {
202+
"1" | "true" | "yes" | "on" => true,
203+
_ => false,
204+
})
205+
.unwrap_or(false)
206+
}
207+
157208
#[cfg(test)]
158209
mod tests {
159210
use super::*;
@@ -165,6 +216,8 @@ mod tests {
165216
let config = GaggleConfig::default();
166217
assert!(!config.verbose_logging);
167218
assert_eq!(config.http_timeout_secs, 30);
219+
assert!(config.download_wait_timeout_ms >= 1000);
220+
assert!(config.download_wait_poll_ms > 0);
168221
}
169222

170223
#[test]
@@ -453,4 +506,29 @@ mod tests {
453506
assert!(!cache_limit_is_soft());
454507
env::remove_var("GAGGLE_CACHE_HARD_LIMIT");
455508
}
509+
510+
#[test]
511+
#[serial]
512+
fn test_download_wait_runtime_overrides() {
513+
env::set_var("GAGGLE_DOWNLOAD_WAIT_TIMEOUT_MS", "1234");
514+
env::set_var("GAGGLE_DOWNLOAD_WAIT_POLL_MS", "17");
515+
assert_eq!(download_wait_timeout_ms(), 1234);
516+
assert_eq!(download_wait_poll_interval_ms(), 17);
517+
env::remove_var("GAGGLE_DOWNLOAD_WAIT_TIMEOUT_MS");
518+
env::remove_var("GAGGLE_DOWNLOAD_WAIT_POLL_MS");
519+
}
520+
521+
#[test]
522+
#[serial]
523+
fn test_offline_mode_env_parsing() {
524+
std::env::remove_var("GAGGLE_OFFLINE");
525+
assert!(!offline_mode());
526+
std::env::set_var("GAGGLE_OFFLINE", "1");
527+
assert!(offline_mode());
528+
std::env::set_var("GAGGLE_OFFLINE", "true");
529+
assert!(offline_mode());
530+
std::env::set_var("GAGGLE_OFFLINE", "no");
531+
assert!(!offline_mode());
532+
std::env::remove_var("GAGGLE_OFFLINE");
533+
}
456534
}

0 commit comments

Comments
 (0)