diff --git a/.env.example b/.env.example index b07b431..e5f6034 100644 --- a/.env.example +++ b/.env.example @@ -2,4 +2,17 @@ AMAZON_EMAIL= AMAZON_PASSWORD= ASIN= +# Option 1: Using OpenAI (default) +AI_PROVIDER=openai OPENAI_API_KEY= + +# Option 2: Using Ollama (Local/Self-hosted) +# Uncomment these lines and comment out OPENAI_API_KEY to use Ollama +# AI_PROVIDER=ollama +# OLLAMA_BASE_URL=http://localhost:11434 +# OLLAMA_VISION_MODEL=qwen2.5vl:7b # For vision/OCR tasks +# OLLAMA_MODEL=llama3.2 # For text cleaning tasks +# OLLAMA_CONCURRENCY=16 + +# Option 3: Using Tesseract.js OCR (free, local, no API needed) +# OCR_CONCURRENCY=4 # Number of parallel workers (default: 4) diff --git a/package.json b/package.json index f3b7f06..d5ac742 100644 --- a/package.json +++ b/package.json @@ -25,6 +25,7 @@ "@inquirer/prompts": "^7.0.0", "dotenv": "^17.2.3", "fluent-ffmpeg": "^2.1.3", + "globby": "^14.0.2", "hash-object": "^5.0.1", "hh-mm-ss": "^1.2.0", "ky": "^1.12.0", @@ -39,6 +40,7 @@ "sharp": "^0.34.4", "tar": "^7.5.1", "tempy": "^3.1.0", + "tesseract.js": "^5.1.1", "type-fest": "^5.1.0", "unrealspeech-api": "^1.0.2" }, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b21fb1f..58a6e56 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -17,6 +17,9 @@ importers: fluent-ffmpeg: specifier: ^2.1.3 version: 2.1.3 + globby: + specifier: ^14.0.2 + version: 14.1.0 hash-object: specifier: ^5.0.1 version: 5.0.1 @@ -59,6 +62,9 @@ importers: tempy: specifier: ^3.1.0 version: 3.1.0 + tesseract.js: + specifier: ^5.1.1 + version: 5.1.1 type-fest: specifier: ^5.1.0 version: 5.1.0 @@ -1042,6 +1048,9 @@ packages: resolution: {integrity: sha512-UYmTpOBwgPScZpS4A+YbapwWuBwasxvO/2IOHArSsAhL/+ZdmATBXTex3t+l2hXwLVYK382ibr/nKoY9GKe86w==} hasBin: true + bmp-js@0.1.0: + resolution: {integrity: sha512-vHdS19CnY3hwiNdkaqk93DvjVLfbEcI8mys4UjuWrlX1haDmroo8o4xCzh4wD6DGV6HxRCyauwhHRqMTfERtjw==} + brace-expansion@1.1.11: resolution: {integrity: 
sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==} @@ -1730,6 +1739,9 @@ packages: resolution: {integrity: sha512-2y91h5OpQlolefMPmUlivelittSWy0rP+oYVpn6A7GwVHNE8AWzoYOBNmlwks3LobaJxgHCYZAnyNo2GgpNRNQ==} engines: {node: '>=0.10.0'} + idb-keyval@6.2.2: + resolution: {integrity: sha512-yjD9nARJ/jb1g+CvD0tlhUHOrJ9Sy0P8T9MF3YaLlHnSRpwPfpTX0XIvpmw3gAJUmEu3FiICLBDPXVwyEvrleg==} + ignore@5.3.2: resolution: {integrity: sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==} engines: {node: '>= 4'} @@ -1821,6 +1833,9 @@ packages: resolution: {integrity: sha512-PwwhEakHVKTdRNVOw+/Gyh0+MzlCl4R6qKvkhuvLtPMggI1WAHt9sOwZxQLSGpUaDnrdyDsomoRgNnCfKNSXXg==} engines: {node: '>= 0.4'} + is-electron@2.2.2: + resolution: {integrity: sha512-FO/Rhvz5tuw4MCWkpMzHFKWD2LsfHzIb7i6MdPYZ/KW7AlxawyLkqdy+jPZP1WubqEADE3O4FUENlJHDfQASRg==} + is-empty-iterable@3.0.0: resolution: {integrity: sha512-ZXVNGZrRvda9spnGVME3nTYTyDNjCTrmRy3DfDjBaMQ7aftcPsy/vkJoLL47IwcAbgioIfGvjQJWdit8GiggPg==} engines: {node: '>=12'} @@ -1936,6 +1951,9 @@ packages: resolution: {integrity: sha512-p3EcsicXjit7SaskXHs1hA91QxgTw46Fv6EFKKGS5DRFLD8yKnohjF3hxoju94b/OcMZoQukzpPpBE9uLVKzgQ==} engines: {node: '>= 0.4'} + is-url@1.2.4: + resolution: {integrity: sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==} + is-weakmap@2.0.2: resolution: {integrity: sha512-K5pXYOm9wqY1RgjpL3YTkF39tni1XajUIkawTLUo9EZEVUFga5gSQJF8nNS7ZwJQ02y+1YCNYcMh+HIf1ZqE+w==} engines: {node: '>= 0.4'} @@ -2129,6 +2147,15 @@ packages: natural-compare@1.4.0: resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==} + node-fetch@2.7.0: + resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==} + engines: {node: 4.x || >=6.0.0} + peerDependencies: + encoding: ^0.1.0 + peerDependenciesMeta: + encoding: + 
optional: true + node-id3@0.2.6: resolution: {integrity: sha512-w8GuKXLlPpDjTxLowCt/uYMhRQzED3cg2GdSG1i6RSGKeDzPvxlXeLQuQInKljahPZ0aDnmyX7FX8BbJOM7REg==} @@ -2203,6 +2230,10 @@ packages: resolution: {integrity: sha512-8EcOGJk/JXFaoGjeFM53Z3zBnwOpKtZeu5X0wts67WqA1PTnsmwRgUw9aGAsQ5V6cuTfJUv282h1ypFgDGPDSA==} engines: {node: '>=18'} + opencollective-postinstall@2.0.3: + resolution: {integrity: sha512-8AV/sCtuzUeTo8gQK5qDZzARrulB3egtLzFgteqB2tcT4Mw7B8Kt7JcDHmltjz6FOAHsvTevk70gZEbhM4ZS9Q==} + hasBin: true + optionator@0.9.4: resolution: {integrity: sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==} engines: {node: '>= 0.8.0'} @@ -2355,6 +2386,9 @@ packages: resolution: {integrity: sha512-fmfw4XgoDke3kdI6h4xcUz1dG8uaiv5q9gcEwLS4Pnth2kxT+GZ7YehS1JTMGBQmtV7Y4GFGbs2re2NqhdozUg==} engines: {node: '>= 0.4'} + regenerator-runtime@0.13.11: + resolution: {integrity: sha512-kY1AZVr2Ra+t+piVaJ4gxaFaReZVH40AKNo7UCX6W+dEwBo/2oZJzqfuN1qLq1oL45o56cPaTXELwrTh8Fpggg==} + regenerator-runtime@0.14.1: resolution: {integrity: sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==} @@ -2629,6 +2663,12 @@ packages: resolution: {integrity: sha512-7jDLIdD2Zp0bDe5r3D2qtkd1QOCacylBuL7oa4udvN6v2pqr4+LcCr67C8DR1zkpaZ8XosF5m1yQSabKAW6f2g==} engines: {node: '>=14.16'} + tesseract.js-core@5.1.1: + resolution: {integrity: sha512-KX3bYSU5iGcO1XJa+QGPbi+Zjo2qq6eBhNjSGR5E5q0JtzkoipJKOUQD7ph8kFyteCEfEQ0maWLu8MCXtvX5uQ==} + + tesseract.js@5.1.1: + resolution: {integrity: sha512-lzVl/Ar3P3zhpUT31NjqeCo1f+D5+YfpZ5J62eo2S14QNVOmHBTtbchHm/YAbOOOzCegFnKf4B3Qih9LuldcYQ==} + tiny-inflate@1.0.3: resolution: {integrity: sha512-pkY1fj1cKHb2seWDy0B16HeWyczlJA9/WW3u3c4z/NiWDsO3DOU5D7nhTLE9CF0yXv/QZFY7sEJmj24dK+Rrqw==} @@ -2662,6 +2702,9 @@ packages: resolution: {integrity: sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==} engines: {node: '>=8.0'} + tr46@0.0.3: + resolution: {integrity: 
sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==} + ts-api-utils@2.1.0: resolution: {integrity: sha512-CUgTZL1irw8u29bzrOD/nH85jqyc74D6SshFgujOIA7osm2Rz7dYH77agkx7H4FBNxDq7Cjf+IjaX/8zwFW+ZQ==} engines: {node: '>=18.12'} @@ -2858,6 +2901,15 @@ packages: jsdom: optional: true + wasm-feature-detect@1.8.0: + resolution: {integrity: sha512-zksaLKM2fVlnB5jQQDqKXXwYHLQUVH9es+5TOOHwGOVJOCeRBCiPjwSg+3tN2AdTCzjgli4jijCH290kXb/zWQ==} + + webidl-conversions@3.0.1: + resolution: {integrity: sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==} + + whatwg-url@5.0.0: + resolution: {integrity: sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==} + which-boxed-primitive@1.0.2: resolution: {integrity: sha512-bwZdv0AKLpplFY2KZRX6TvyuN7ojjr7lwkg6ml0roIy9YeuSr7JS372qlNW18UQYzgYK9ziGcerWqZOmEn9VNg==} @@ -2939,6 +2991,9 @@ packages: zero-fill@2.2.4: resolution: {integrity: sha512-/N5GEDauLHz2uGnuJXWO1Wfib4EC+q4yp9C1jojM7RubwEKADqIqMcYpETMm1lRop403fi3v1qTOdgDE8DIOdw==} + zlibjs@0.3.1: + resolution: {integrity: sha512-+J9RrgTKOmlxFSDHo0pI1xM6BLVUv+o0ZT9ANtCxGkjIVCCUdx9alUF8Gm+dGLKbkkkidWIHFDZHDMpfITt4+w==} + zod-validation-error@4.0.2: resolution: {integrity: sha512-Q6/nZLe6jxuU80qb/4uJ4t5v2VEZ44lzQjPDhYJNztRQ4wyWc6VF3D3Kb/fAuPetZQnhS3hnajCf9CsWesghLQ==} engines: {node: '>=18.0.0'} @@ -3888,6 +3943,8 @@ snapshots: baseline-browser-mapping@2.8.18: {} + bmp-js@0.1.0: {} + brace-expansion@1.1.11: dependencies: balanced-match: 1.0.2 @@ -4838,6 +4895,8 @@ snapshots: dependencies: safer-buffer: 2.1.2 + idb-keyval@6.2.2: {} + ignore@5.3.2: {} ignore@7.0.5: {} @@ -4934,6 +4993,8 @@ snapshots: call-bound: 1.0.4 has-tostringtag: 1.0.2 + is-electron@2.2.2: {} + is-empty-iterable@3.0.0: {} is-extglob@2.1.1: {} @@ -5034,6 +5095,8 @@ snapshots: dependencies: which-typed-array: 1.1.19 + is-url@1.2.4: {} + is-weakmap@2.0.2: {} is-weakref@1.0.2: @@ -5209,6 +5272,10 @@ 
snapshots: natural-compare@1.4.0: {} + node-fetch@2.7.0: + dependencies: + whatwg-url: 5.0.0 + node-id3@0.2.6: dependencies: iconv-lite: 0.6.2 @@ -5300,6 +5367,8 @@ snapshots: dependencies: ky: 1.12.0 + opencollective-postinstall@2.0.3: {} + optionator@0.9.4: dependencies: deep-is: 0.1.4 @@ -5439,6 +5508,8 @@ snapshots: globalthis: 1.0.4 which-builtin-type: 1.1.4 + regenerator-runtime@0.13.11: {} + regenerator-runtime@0.14.1: {} regexp-tree@0.1.27: {} @@ -5815,6 +5886,23 @@ snapshots: type-fest: 2.19.0 unique-string: 3.0.0 + tesseract.js-core@5.1.1: {} + + tesseract.js@5.1.1: + dependencies: + bmp-js: 0.1.0 + idb-keyval: 6.2.2 + is-electron: 2.2.2 + is-url: 1.2.4 + node-fetch: 2.7.0 + opencollective-postinstall: 2.0.3 + regenerator-runtime: 0.13.11 + tesseract.js-core: 5.1.1 + wasm-feature-detect: 1.8.0 + zlibjs: 0.3.1 + transitivePeerDependencies: + - encoding + tiny-inflate@1.0.3: {} tinybench@2.9.0: {} @@ -5840,6 +5928,8 @@ snapshots: dependencies: is-number: 7.0.0 + tr46@0.0.3: {} + ts-api-utils@2.1.0(typescript@5.9.3): dependencies: typescript: 5.9.3 @@ -6074,6 +6164,15 @@ snapshots: - tsx - yaml + wasm-feature-detect@1.8.0: {} + + webidl-conversions@3.0.1: {} + + whatwg-url@5.0.0: + dependencies: + tr46: 0.0.3 + webidl-conversions: 3.0.1 + which-boxed-primitive@1.0.2: dependencies: is-bigint: 1.0.4 @@ -6189,6 +6288,8 @@ snapshots: zero-fill@2.2.4: {} + zlibjs@0.3.1: {} + zod-validation-error@4.0.2(zod@4.1.12): dependencies: zod: 4.1.12 diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index cd1eded..e9790ff 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -6,5 +6,7 @@ onlyBuiltDependencies: - esbuild - sharp - simple-git-hooks + - tesseract.js + - unrealspeech-api packageManagerStrict: false diff --git a/readme.md b/readme.md index e73f81c..8f50a84 100644 --- a/readme.md +++ b/readme.md @@ -16,6 +16,8 @@ - [Setup Env Vars](#setup-env-vars) - [Extract Kindle Book](#extract-kindle-book) - [Transcribe Book Content](#transcribe-book-content) + - 
[(Optional) Validate Content](#optional-validate-content) + - [(Optional) Clean Content with Ollama](#optional-clean-content-with-ollama) - [(Optional) Export Book as PDF](#optional-export-book-as-pdf) - [(Optional) Export Book as EPUB](#optional-export-book-as-epub) - [(Optional) Export Book as Markdown](#optional-export-book-as-markdown) @@ -28,13 +30,13 @@ ## Intro -This project makes it easy to export the contents of any ebook in your Kindle library as text, PDF, EPUB, or as a custom, AI-narrated audiobook. It only requires a valid Amazon Kindle account and an OpenAI API key. +This project makes it easy to export the contents of any ebook in your Kindle library as text, PDF, EPUB, or as a custom, AI-narrated audiobook. It requires a valid Amazon Kindle account and either an OpenAI API key or a local Ollama installation. _You must own the ebook on Kindle for this project to work._ ### How does it work? -It works by logging into your [Kindle web reader](https://read.amazon.com) account using [Playwright](https://playwright.dev), exporting each page of a book as a PNG image, and then using a vLLM (`gpt-4o` or `gpt-4o-mini`) to transcribe the text from each page to text. Once we have the raw book contents and metadata, then it's easy to convert it to PDF, EPUB, etc. šŸ”„ +It works by logging into your [Kindle web reader](https://read.amazon.com) account using [Playwright](https://playwright.dev), exporting each page of a book as a PNG image, and then using either a vLLM (OpenAI's `gpt-4o`/`gpt-4o-mini` or Ollama vision models) to transcribe the text from each page. Once we have the raw book contents and metadata, then it's easy to convert it to PDF, EPUB, etc. 
šŸ”„ This [example](./examples/B0819W19WD) uses the first page of the scifi book [Revelation Space](https://www.amazon.com/gp/product/B0819W19WD?ref_=dbs_m_mng_rwt_calw_tkin_0&storeType=ebooks) by [Alastair Reynolds](https://www.goodreads.com/author/show/51204.Alastair_Reynolds): @@ -153,11 +155,15 @@ Make sure you have `node >= 18` and [pnpm](https://pnpm.io) installed. 2. Run `pnpm install` 3. Set up environment variables ([details](#setup-env-vars)) 4. Run `src/extract-kindle-book.ts` ([details](#extract-kindle-book)) -5. Run `src/transcribe-book-content.ts` ([details](#transcribe-book-content)) -6. (Optional) Run `src/export-book-pdf.ts` ([details](#optional-export-book-as-pdf)) -7. (Optional) Export book as EPUB ([details](#optional-export-book-as-epub)) -8. (Optional) Run `src/export-book-markdown.ts` ([details](#optional-export-book-as-markdown)) -9. (Optional) Run `src/export-book-audio.ts` ([details](#optional-export-book-as-ai-narrated-audiobook-)) +5. Transcribe the book content using one of these options: + - **Option A**: Run `src/transcribe-book-content.ts` for AI vision models ([details](#transcribe-book-content)) + - **Option B**: Run `src/ocr-transcribe-book-content.ts` for free local OCR ([details](#transcribe-book-content)) +6. (Optional) Run `src/validate-content.ts` to check for OCR errors ([details](#optional-validate-content)) +7. (Optional) Run `src/clean-content-with-ollama.ts` to clean up text formatting ([details](#optional-clean-content-with-ollama)) +8. (Optional) Run `src/export-book-pdf.ts` ([details](#optional-export-book-as-pdf)) +9. (Optional) Export book as EPUB ([details](#optional-export-book-as-epub)) +10. (Optional) Run `src/export-book-markdown.ts` ([details](#optional-export-book-as-markdown)) +11. 
(Optional) Run `src/export-book-audio.ts` ([details](#optional-export-book-as-ai-narrated-audiobook-)) ### Setup Env Vars @@ -168,11 +174,29 @@ AMAZON_EMAIL= AMAZON_PASSWORD= ASIN= +# Option 1: Using OpenAI (default) +AI_PROVIDER=openai OPENAI_API_KEY= + +# Option 2: Using Ollama (Local/Self-hosted) +# Uncomment these lines and comment out OPENAI_API_KEY to use Ollama +# AI_PROVIDER=ollama +# OLLAMA_BASE_URL=http://localhost:11434 +# OLLAMA_VISION_MODEL=qwen2.5vl:7b # For vision/OCR transcription +# OLLAMA_MODEL=llama3.2 # For text cleaning with clean-content-with-ollama.ts +# OLLAMA_CONCURRENCY=16 + +# Option 3: Using Tesseract.js OCR (free, local, no API needed) +# OCR_CONCURRENCY=4 # Number of parallel workers (default: 4) ``` You can find your book's [ASIN](https://en.wikipedia.org/wiki/Amazon_Standard_Identification_Number) (Amazon ID) by visiting [read.amazon.com](https://read.amazon.com) and clicking on the book you want to export. The resulting URL will look like `https://read.amazon.com/?asin=B0819W19WD&ref_=kwl_kr_iv_rec_2`, with `B0819W19WD` being the ASIN in this case. +**For Ollama users**: Make sure you have a vision-capable model installed. Recommended models: +- `qwen2.5vl:7b` - Recommended, good balance of speed and quality +- `qwen2.5vl:32b` - Higher quality but slower +- `llama3.2-vision:latest` - Alternative option + ### Extract Kindle Book ```sh @@ -197,16 +221,59 @@ npx tsx src/extract-kindle-book.ts ### Transcribe Book Content +#### Option A: Using AI Vision Models (Recommended) + ```sh npx tsx src/transcribe-book-content.ts ``` - _(This takes a few minutes to run)_ -- This takes each of the page screenshots and runs them through a vLLM (`gpt-4o` or `gpt-4o-mini`) to extract the raw text content from each page of the book. -- It then stitches these text chunks together, taking into account chapter boundaries. -- The result is stored as JSON to `out/${asin}/content.json`. 
+- This takes each of the page screenshots and runs them through a vision model to extract the raw text content from each page +- Supports both OpenAI (`gpt-4o`/`gpt-4o-mini`) and Ollama (local vision models) +- It then stitches these text chunks together, taking into account chapter boundaries +- The result is stored as JSON to `out/${asin}/content.json` - Example: [examples/B0819W19WD/content.json](./examples/B0819W19WD/content.json) +#### Option B: Using OCR with Tesseract.js (Alternative) + +```sh +npx tsx src/ocr-transcribe-book-content.ts +``` + +- Fast, free, and runs entirely locally without any API calls +- Uses Tesseract.js for optical character recognition +- May have slightly lower accuracy than vision models on complex layouts +- Supports configurable concurrency via `OCR_CONCURRENCY` env variable (default: 4) +- The result is stored to the same `out/${asin}/content.json` file + +### (Optional) Validate Content + +After transcription, you can check for OCR errors: + +```sh +npx tsx src/validate-content.ts +``` + +- Checks for common OCR issues like repetitive text, excessive punctuation, or empty pages +- Generates a validation report showing errors and warnings +- Saves the report to `out/${asin}/validation-report.json` +- Useful for identifying pages that may need manual correction + +### (Optional) Clean Content with Ollama + +Use Ollama's text models to clean up formatting issues in the transcribed content: + +```sh +npx tsx src/clean-content-with-ollama.ts +``` + +- Uses Ollama's text models (like `llama3.2`) to improve text formatting +- Fixes paragraph breaks and removes excessive whitespace +- Preserves original words exactly (no spelling/grammar changes) +- Uses the `OLLAMA_MODEL` environment variable if set (defaults to `llama3.2`) +- Saves the cleaned content back to `out/${asin}/content.json`, overwriting the transcription +- Creates a backup of the original at `out/${asin}/content.backup.json` + ### (Optional) Export Book as PDF ```sh npx tsx src/export-book-pdf.ts ``` diff --git 
a/src/clean-content-with-ollama.ts b/src/clean-content-with-ollama.ts new file mode 100644 index 0000000..6296476 --- /dev/null +++ b/src/clean-content-with-ollama.ts @@ -0,0 +1,142 @@ +import 'dotenv/config' + +import fs from 'node:fs/promises' +import path from 'node:path' + +// import ky from 'ky' // ky doesn't work well in Node.js with Ollama +import pMap from 'p-map' + +import type { ContentChunk } from './types' +import { assert, getEnv } from './utils' + +async function main() { + const asin = getEnv('ASIN') + assert(asin, 'ASIN is required') + + const ollamaBaseUrl = getEnv('OLLAMA_BASE_URL') || 'http://localhost:11434' + const ollamaModel = getEnv('OLLAMA_MODEL') || 'llama3.2' + const concurrency = Math.max(1, Math.min(8, Number.parseInt(getEnv('OLLAMA_CONCURRENCY') || '4', 10))) + + const outDir = path.join('out', asin) + const contentPath = path.join(outDir, 'content.json') + + // Read existing content + const contentRaw = await fs.readFile(contentPath, 'utf-8') + const content = JSON.parse(contentRaw) as ContentChunk[] + assert(content.length > 0, 'No content found in content.json') + + console.log(`Loaded ${content.length} chunks from content.json`) + console.log(`Using Ollama at ${ollamaBaseUrl} with model ${ollamaModel}`) + console.log(`Concurrency: ${concurrency} parallel requests\n`) + + // Warm up the model + console.log('Warming up model...') + try { + const response = await fetch(`${ollamaBaseUrl}/api/chat`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + model: ollamaModel, + messages: [{ role: 'user', content: 'Hi' }], + stream: false + }), + signal: AbortSignal.timeout(60000) + }) + await response.json() + console.log('Model ready!\n') + } catch (err) { + console.warn('Model warmup failed:', (err as Error).message) + } + + let processed = 0 + const cleanedContent: ContentChunk[] = await pMap( + content, + async (chunk) => { + const { index, page, text, screenshot } = chunk + + // Skip empty or 
very short text + if (!text || text.length < 20) { + processed++ + console.log(`[${processed}/${content.length}] Skipped chunk ${index} (too short)`) + return chunk + } + + try { + const response = await fetch(`${ollamaBaseUrl}/api/chat`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + model: ollamaModel, + messages: [ + { + role: 'system', + content: `You are a text formatting assistant. Your job is to clean up OCR text by fixing paragraph breaks and spacing while preserving the original content exactly. + +Rules: +1. Fix paragraph breaks - add proper line breaks between paragraphs +2. Remove excessive whitespace and fix spacing issues +3. DO NOT change, add, or remove any words +4. DO NOT fix spelling or grammar +5. DO NOT add punctuation +6. Keep dialogue and quoted text exactly as-is +7. Preserve chapter titles and headings +8. Output ONLY the cleaned text, no explanations or comments` + }, + { + role: 'user', + content: `Clean up this text:\n\n${text}` + } + ], + stream: false, + options: { + temperature: 0, + num_predict: 2048 + } + }), + signal: AbortSignal.timeout(120000) + }) + + if (!response.ok) { + throw new Error(`Ollama API error: ${response.status} ${response.statusText}`) + } + + const data = await response.json() as { message: { content: string } } + const cleanedText = data.message.content.trim() + + processed++ + const changePercent = Math.abs(cleanedText.length - text.length) / text.length * 100 + console.log( + `[${processed}/${content.length}] Chunk ${index} (page ${page}): ` + + `${text.length} → ${cleanedText.length} chars (${changePercent.toFixed(1)}% change)` + ) + + return { + index, + page, + text: cleanedText, + screenshot + } + } catch (err) { + console.error(`Error processing chunk ${index}:`, (err as Error).message) + processed++ + return chunk // Return original on error + } + }, + { concurrency } + ) + + // Backup original + const backupPath = path.join(outDir, 
'content.backup.json') + await fs.writeFile(backupPath, contentRaw) + console.log(`\nāœ“ Backed up original to ${backupPath}`) + + // Save cleaned content + await fs.writeFile( + contentPath, + JSON.stringify(cleanedContent, null, 2) + ) + console.log(`āœ“ Saved cleaned content to ${contentPath}`) + console.log(`\nāœ“ Processed ${cleanedContent.length} chunks successfully!`) +} + +await main() diff --git a/src/export-book-pdf.ts b/src/export-book-pdf.ts index dcfcc3f..48d0f7f 100644 --- a/src/export-book-pdf.ts +++ b/src/export-book-pdf.ts @@ -65,12 +65,13 @@ async function main() { let needsNewPage = false let index = 0 - for (let i = 0; i < metadata.toc.length - 1; i++) { + for (let i = 0; i < metadata.toc.length; i++) { const tocItem = metadata.toc[i]! if (tocItem.page === undefined) continue - const nextTocItem = metadata.toc[i + 1]! - const nextIndex = nextTocItem.page + // Find the next TOC item's page to determine chapter boundary + const nextTocItem = metadata.toc[i + 1] + const nextIndex = nextTocItem?.page ? content.findIndex((c) => c.page >= nextTocItem.page!) 
: content.length if (nextIndex < index) continue diff --git a/src/extract-kindle-book.ts b/src/extract-kindle-book.ts index 03d2222..31c4109 100644 --- a/src/extract-kindle-book.ts +++ b/src/extract-kindle-book.ts @@ -252,11 +252,19 @@ async function main() { // Only enter 2-factor auth code if needed if (code) { await page.locator('input[type="tel"]').fill(code) - await page + + // Try multiple possible selectors for the submit button + const submitButton = page .locator( - 'input[type="submit"][aria-labelledby="cvf-submit-otp-button-announce"]' + 'input[type="submit"][aria-labelledby="cvf-submit-otp-button-announce"], ' + + 'input[type="submit"]#cvf-submit-otp-button, ' + + 'input[type="submit"][name="cvf-submit-otp-button"], ' + + 'button[type="submit"], ' + + 'input[type="submit"]' ) - .click() + .first() + + await submitButton.click() } } diff --git a/src/ocr-transcribe-book-content.ts b/src/ocr-transcribe-book-content.ts new file mode 100644 index 0000000..6be00c9 --- /dev/null +++ b/src/ocr-transcribe-book-content.ts @@ -0,0 +1,89 @@ +import 'dotenv/config' + +import fs from 'node:fs/promises' +import path from 'node:path' + +import { globby } from 'globby' +import pMap from 'p-map' +import { createWorker } from 'tesseract.js' + +import type { ContentChunk } from './types' +import { assert, getEnv } from './utils' + +async function main() { + const asin = getEnv('ASIN') + assert(asin, 'ASIN is required') + + const outDir = path.join('out', asin) + const pageScreenshotsDir = path.join(outDir, 'pages') + const pageScreenshots = await globby(`${pageScreenshotsDir}/*.png`) + assert(pageScreenshots.length, 'no page screenshots found') + + // Get concurrency setting (default 4 for OCR to balance speed/memory) + const concurrency = Math.max(1, Math.min(16, Number.parseInt(getEnv('OCR_CONCURRENCY') || '4', 10))) + + // Create a worker pool + const workers = await Promise.all( + Array.from({ length: concurrency }, async () => { + const worker = await 
createWorker('eng') + return worker + }) + ) + + const content: ContentChunk[] = ( + await pMap( + pageScreenshots, + async (screenshot, workerIndex) => { + const metadataMatch = screenshot.match(/0*(\d+)-0*(\d+)\.png/) + assert( + metadataMatch?.[1] && metadataMatch?.[2], + `invalid screenshot filename: ${screenshot}` + ) + const index = Number.parseInt(metadataMatch[1]!, 10) + const page = Number.parseInt(metadataMatch[2]!, 10) + assert( + !Number.isNaN(index) && !Number.isNaN(page), + `invalid screenshot filename: ${screenshot}` + ) + + try { + // Use worker from pool (round-robin) + const worker = workers[workerIndex % workers.length]! + const { data } = await worker.recognize(screenshot) + + const rawText = data.text || '' + + const text = rawText + .replace(/^\s*\d+\s*$\n+/m, '') + // .replaceAll(/\n+/g, '\n') + .replaceAll(/^\s*/gm, '') + .replaceAll(/\s*$/gm, '') + + const result: ContentChunk = { + index, + page, + text, + screenshot + } + console.log(result) + + return result + } catch (err) { + console.error(`error processing image ${index} (${screenshot})`, err) + } + }, + { concurrency } + ) + ).filter(Boolean) + + // Terminate all workers + await Promise.all(workers.map((w: any) => w.terminate())) + + await fs.writeFile( + path.join(outDir, 'content.json'), + JSON.stringify(content, null, 2) + ) + console.log(JSON.stringify(content, null, 2)) +} + +await main() diff --git a/src/transcribe-book-content.ts b/src/transcribe-book-content.ts index 0e6082f..5ab2c36 100644 --- a/src/transcribe-book-content.ts +++ b/src/transcribe-book-content.ts @@ -20,6 +20,8 @@ async function main() { assert(metadata.pages?.length, 'no page screenshots found') assert(metadata.toc?.length, 'invalid book metadata: missing toc') + console.log(`Found ${metadata.pages.length} pages to transcribe`) + // eslint-disable-next-line unicorn/no-array-reduce const pageToTocItemMap = metadata.toc.reduce( (acc, tocItem) => { @@ -35,12 +37,60 @@ async function main() { // const 
pageScreenshots = await globby(`${pageScreenshotsDir}/*.png`) // assert(pageScreenshots.length, 'no page screenshots found') - const openai = new OpenAIClient() + // Check which AI provider to use + const aiProvider = getEnv('AI_PROVIDER') || 'openai' + const ollamaBaseUrl = getEnv('OLLAMA_BASE_URL') + const ollamaVisionModel = getEnv('OLLAMA_VISION_MODEL') + + // Get configurable concurrency for Ollama + const ollamaConcurrency = aiProvider === 'ollama' + ? Math.max(1, Math.min(16, Number.parseInt(getEnv('OLLAMA_CONCURRENCY') || '16', 10))) + : 16 + + let openai: OpenAIClient | undefined + if (aiProvider === 'openai') { + openai = new OpenAIClient() + } else if (aiProvider === 'ollama') { + assert(ollamaBaseUrl, 'OLLAMA_BASE_URL is required when using ollama provider') + assert(ollamaVisionModel, 'OLLAMA_VISION_MODEL is required when using ollama provider') + console.log(`Using Ollama at ${ollamaBaseUrl} with model ${ollamaVisionModel}`) + console.log(`Concurrency: ${ollamaConcurrency} parallel requests`) + + // Warm up the model with a simple request + console.log('Warming up model...') + try { + const response = await fetch(`${ollamaBaseUrl}/api/chat`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + model: ollamaVisionModel, + messages: [ + { + role: 'user', + content: 'Hello' + } + ], + stream: false + }), + signal: AbortSignal.timeout(120000) // 2 minute timeout + }) + const warmupResponse = await response.json() + console.log('Model warmed up successfully!') + } catch (err: any) { + console.error('Model warmup failed:', err) + console.error('Error details:', err.message, err.stack) + // Don't continue if warmup fails - there's likely a configuration issue + throw new Error(`Failed to warm up Ollama model: ${err.message}`) + } + } + + console.log(`Starting transcription with concurrency: ${aiProvider === 'ollama' ? 
ollamaConcurrency : 16}`) const content: ContentChunk[] = ( await pMap( metadata.pages, async (pageChunk, pageChunkIndex) => { + console.log(`Processing page ${pageChunk.page} (${pageChunkIndex + 1}/${metadata.pages.length})`) const { screenshot, index, page } = pageChunk const screenshotBuffer = await fs.readFile(screenshot) const screenshotBase64 = `data:image/png;base64,${screenshotBuffer.toString('base64')}` @@ -61,31 +111,70 @@ async function main() { let retries = 0 do { - const res = await openai.createChatCompletion({ - model: 'gpt-4.1-mini', - temperature: retries < 2 ? 0 : 0.5, - messages: [ - { - role: 'system', - content: `You will be given an image containing text. Read the text from the image and output it verbatim. + let rawText: string + + if (aiProvider === 'openai') { + const res = await openai!.createChatCompletion({ + model: 'gpt-4.1-mini', + temperature: retries < 2 ? 0 : 0.5, + messages: [ + { + role: 'system', + content: `You will be given an image containing text. Read the text from the image and output it verbatim. Do not include any additional text, descriptions, or punctuation. Ignore any embedded images. Do not use markdown.${retries > 2 ? '\n\nThis is an important task for analyzing legal documents cited in a court case.' : ''}` - }, - { - role: 'user', - content: [ - { - type: 'image_url', - image_url: { - url: screenshotBase64 + }, + { + role: 'user', + content: [ + { + type: 'image_url', + image_url: { + url: screenshotBase64 + } } + ] as any + } + ] + }) + rawText = res.choices[0]!.message.content! + } else { + // Ollama API + const response = await fetch(`${ollamaBaseUrl}/api/chat`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + model: ollamaVisionModel, + messages: [ + { + role: 'system', + content: `You will be given an image containing text. Read the text from the image and output it verbatim. + +Do not include any additional text, descriptions, or punctuation. 
Ignore any embedded images. Do not use markdown.${retries > 2 ? '\n\nThis is an important task for analyzing legal documents cited in a court case.' : ''}` + }, + { + role: 'user', + content: 'Please transcribe all the text visible in this image.', + images: [screenshotBase64.replace('data:image/png;base64,', '')] } - ] as any - } - ] - }) + ], + stream: false, + options: { + temperature: retries < 2 ? 0 : 0.5, + num_predict: 1024, + num_ctx: 4096 + } + }), + signal: AbortSignal.timeout(120000) // 2 minute timeout + }) + + if (!response.ok) { + throw new Error(`Ollama API error: ${response.status} ${response.statusText}`) + } - const rawText = res.choices[0]!.message.content! + const data = await response.json() as { message: { content: string } } + rawText = data.message.content + } let text = rawText .replace(/^\s*\d+\s*$\n+/m, '') // .replaceAll(/\n+/g, '\n') @@ -138,7 +227,7 @@ Do not include any additional text, descriptions, or punctuation. Ignore any emb console.error(`error processing image ${index} (${screenshot})`, err) } }, - { concurrency: 16 } + { concurrency: aiProvider === 'ollama' ? 
ollamaConcurrency : 16 }
  )
).filter(Boolean)

diff --git a/src/validate-content.ts b/src/validate-content.ts
new file mode 100644
index 0000000..13c4c29
--- /dev/null
+++ b/src/validate-content.ts
@@ -0,0 +1,232 @@
import 'dotenv/config'

import fs from 'node:fs/promises'
import path from 'node:path'

import type { ContentChunk } from './types'
import { assert, getEnv } from './utils'

// A single problem detected on one transcribed page.
interface ValidationIssue {
  index: number
  page: number
  type: string
  severity: 'error' | 'warning'
  message: string
  preview?: string
}

/**
 * Validates the transcribed book content in `out/<ASIN>/content.json`.
 *
 * Scans every page for symptoms of vision-model/OCR failure (repetition
 * loops, punctuation loops, gibberish, truncation, suspiciously short or
 * long text), prints a grouped report to the console, and writes the full
 * issue list to `out/<ASIN>/validation-report.json`.
 *
 * Requires the `ASIN` environment variable; throws if it is missing or if
 * `content.json` cannot be read/parsed.
 */
async function main() {
  const asin = getEnv('ASIN')
  assert(asin, 'ASIN is required')

  const outDir = path.join('out', asin)
  const contentPath = path.join(outDir, 'content.json')

  console.log('Reading content.json...')
  const contentJson = await fs.readFile(contentPath, 'utf-8')
  const content: ContentChunk[] = JSON.parse(contentJson)

  console.log(`Validating ${content.length} pages...\n`)

  const issues: ValidationIssue[] = []

  for (const chunk of content) {
    const { index, page, text } = chunk

    // 1. Repetitive sentences (3+ occurrences) — classic model-loop symptom.
    // Count all sentences first, then report each offender exactly once.
    // (Testing the threshold inside the counting loop would emit a duplicate
    // issue for every occurrence past the second.)
    const sentences = text.match(/[^.!?]+[.!?]+/g) || []
    const sentenceCounts = new Map<string, number>()
    for (const sentence of sentences) {
      const trimmed = sentence.trim()
      if (trimmed.length > 20) {
        sentenceCounts.set(trimmed, (sentenceCounts.get(trimmed) ?? 0) + 1)
      }
    }
    for (const [sentence, count] of sentenceCounts) {
      if (count >= 3) {
        issues.push({
          index,
          page,
          type: 'repetitive_sentence',
          severity: 'error',
          message: `Sentence repeated ${count} times`,
          preview: sentence.substring(0, 80)
        })
      }
    }

    // 2. Check for excessive dashes or ellipsis (model looping on punctuation)
    const dashCount = (text.match(/—/g) || []).length
    const ellipsisCount = (text.match(/\.\.\./g) || []).length

    if (dashCount > 50) {
      issues.push({
        index,
        page,
        type: 'excessive_dashes',
        severity: 'error',
        message: `Contains ${dashCount} em-dashes (likely model loop)`,
        preview: text.substring(text.indexOf('—'), text.indexOf('—') + 100)
      })
    }

    if (ellipsisCount > 20) {
      issues.push({
        index,
        page,
        type: 'excessive_ellipsis',
        severity: 'error',
        message: `Contains ${ellipsisCount} ellipsis (likely model loop)`,
        preview: text.substring(text.indexOf('...'), text.indexOf('...') + 100)
      })
    }

    // 3. Check for very short text (possible OCR failure)
    if (text.length < 50) {
      issues.push({
        index,
        page,
        type: 'very_short_text',
        severity: 'warning',
        message: `Only ${text.length} characters`,
        preview: text
      })
    }

    // 4. Check for duplicate consecutive lines
    const lines = text.split('\n')
    for (let i = 1; i < lines.length; i++) {
      if (lines[i] === lines[i - 1] && lines[i]!.trim().length > 10) {
        issues.push({
          index,
          page,
          type: 'duplicate_line',
          severity: 'warning',
          message: 'Contains duplicate consecutive lines',
          preview: lines[i]!.substring(0, 80)
        })
        break // Only report once per page
      }
    }

    // 5. Check for duplicate consecutive words. Case-insensitive so
    // sentence-initial duplicates ("The the ...") are caught as well.
    const duplicateWordMatch = text.match(/\b(\w+)\s+\1\b/i)
    if (duplicateWordMatch) {
      issues.push({
        index,
        page,
        type: 'duplicate_word',
        severity: 'warning',
        message: `Duplicate word: "${duplicateWordMatch[1]}"`,
        preview: text.substring(
          duplicateWordMatch.index!,
          duplicateWordMatch.index! + 100
        )
      })
    }

    // 6. Check for unusual character patterns: same letter 5+ times in a row
    const gibberishMatch = text.match(/([a-z])\1{4,}/i)
    if (gibberishMatch) {
      issues.push({
        index,
        page,
        type: 'possible_gibberish',
        severity: 'warning',
        message: `Unusual character pattern detected`,
        preview: text.substring(gibberishMatch.index!, gibberishMatch.index! + 100)
      })
    }

    // 7. Check for very long text (possible repetition that's not sentence-based)
    if (text.length > 5000) {
      issues.push({
        index,
        page,
        type: 'unusually_long',
        severity: 'warning',
        message: `Text is ${text.length} characters (avg is ~800)`,
        preview: text.substring(0, 100)
      })
    }

    // 8. Check for an incomplete sentence at the end (page cut off mid-thought).
    // The sentence regex above only matches runs that END in `.!?`, so
    // inspecting its last match could never fire; instead look at whatever
    // text trails the final sentence-ending character. (A page ending in
    // "..." leaves no trailing text, so ellipsis endings are not flagged.)
    const trailing = text.replace(/^[\s\S]*[.!?]/, '').trim()
    if (trailing.length > 50) {
      issues.push({
        index,
        page,
        type: 'incomplete_sentence',
        severity: 'warning',
        message: 'Last sentence appears incomplete',
        preview: trailing.substring(Math.max(0, trailing.length - 80))
      })
    }
  }

  // Group and display issues
  const errors = issues.filter((i) => i.severity === 'error')
  const warnings = issues.filter((i) => i.severity === 'warning')

  console.log('=== VALIDATION REPORT ===\n')

  if (errors.length > 0) {
    console.log(`\nšŸ”“ ERRORS (${errors.length}):`)
    console.log('These likely need manual review or re-processing:\n')

    for (const issue of errors) {
      console.log(`Page ${issue.page} (index ${issue.index}) - ${issue.type}`)
      console.log(`  ${issue.message}`)
      if (issue.preview) {
        console.log(`  Preview: ${issue.preview}...`)
      }
      console.log()
    }
  }

  if (warnings.length > 0) {
    console.log(`\nāš ļø WARNINGS (${warnings.length}):`)
    console.log('These might be fine, but worth checking:\n')

    // Group warnings by type so repeated categories don't flood the output
    const warningsByType = new Map<string, ValidationIssue[]>()
    for (const warning of warnings) {
      const existing = warningsByType.get(warning.type) || []
      existing.push(warning)
      warningsByType.set(warning.type, existing)
    }

    for (const [type, typeWarnings] of warningsByType) {
      console.log(`${type} (${typeWarnings.length} pages):`)
      for (const warning of typeWarnings.slice(0, 5)) { // Show first 5
        console.log(`  Page ${warning.page}: ${warning.message}`)
      }
      if (typeWarnings.length > 5) {
        console.log(`  ... and ${typeWarnings.length - 5} more`)
      }
      console.log()
    }
  }

  // Summary
  console.log('\n=== SUMMARY ===')
  console.log(`Total pages: ${content.length}`)
  console.log(`Pages with errors: ${new Set(errors.map((i) => i.index)).size}`)
  console.log(`Pages with warnings: ${new Set(warnings.map((i) => i.index)).size}`)
  console.log(`Clean pages: ${content.length - new Set([...errors, ...warnings].map((i) => i.index)).size}`)

  const successRate = ((content.length - new Set(errors.map((i) => i.index)).size) / content.length * 100).toFixed(1)
  console.log(`\nSuccess rate (no errors): ${successRate}%`)

  // Save detailed report
  const reportPath = path.join(outDir, 'validation-report.json')
  await fs.writeFile(reportPath, JSON.stringify({ errors, warnings }, null, 2))
  console.log(`\nDetailed report saved to: ${reportPath}`)

  // List pages that need attention
  const problematicPages = [...new Set(errors.map((i) => i.index))].sort((a, b) => a - b)
  if (problematicPages.length > 0) {
    console.log(`\nPages that need attention: ${problematicPages.join(', ')}`)
  }
}

await main()