diff --git a/fixtures/001.expected b/fixtures/001-textual-pdf/001.expected.txt similarity index 100% rename from fixtures/001.expected rename to fixtures/001-textual-pdf/001.expected.txt diff --git a/fixtures/001.pdf b/fixtures/001-textual-pdf/001.input.pdf similarity index 100% rename from fixtures/001.pdf rename to fixtures/001-textual-pdf/001.input.pdf diff --git a/fixtures/002.expected b/fixtures/002-simple-txt/002.expected.txt similarity index 100% rename from fixtures/002.expected rename to fixtures/002-simple-txt/002.expected.txt diff --git a/fixtures/002.txt b/fixtures/002-simple-txt/002.input.txt similarity index 100% rename from fixtures/002.txt rename to fixtures/002-simple-txt/002.input.txt diff --git a/fixtures/003.expected b/fixtures/003-yaml/003.expected.txt similarity index 100% rename from fixtures/003.expected rename to fixtures/003-yaml/003.expected.txt diff --git a/fixtures/003.yaml b/fixtures/003-yaml/003.input.yaml similarity index 100% rename from fixtures/003.yaml rename to fixtures/003-yaml/003.input.yaml diff --git a/fixtures/004.expected b/fixtures/004-markdown/004.expected.txt similarity index 100% rename from fixtures/004.expected rename to fixtures/004-markdown/004.expected.txt diff --git a/fixtures/004.md b/fixtures/004-markdown/004.input.md similarity index 100% rename from fixtures/004.md rename to fixtures/004-markdown/004.input.md diff --git a/fixtures/005.csv b/fixtures/005-csv/005.expected.txt similarity index 100% rename from fixtures/005.csv rename to fixtures/005-csv/005.expected.txt diff --git a/fixtures/005.expected b/fixtures/005-csv/005.input.csv similarity index 100% rename from fixtures/005.expected rename to fixtures/005-csv/005.input.csv diff --git a/fixtures/006.expected b/fixtures/006-png/006.expected.txt similarity index 100% rename from fixtures/006.expected rename to fixtures/006-png/006.expected.txt diff --git a/fixtures/006.png b/fixtures/006-png/006.input.png similarity index 100% rename from fixtures/006.png 
rename to fixtures/006-png/006.input.png diff --git a/fixtures/007.expected b/fixtures/007-jpg/007.expected.txt similarity index 100% rename from fixtures/007.expected rename to fixtures/007-jpg/007.expected.txt diff --git a/fixtures/007.jpg b/fixtures/007-jpg/007.input.jpg similarity index 100% rename from fixtures/007.jpg rename to fixtures/007-jpg/007.input.jpg diff --git a/fixtures/008.expected b/fixtures/008-gif/008.expected.txt similarity index 100% rename from fixtures/008.expected rename to fixtures/008-gif/008.expected.txt diff --git a/fixtures/008.gif b/fixtures/008-gif/008.input.gif similarity index 100% rename from fixtures/008.gif rename to fixtures/008-gif/008.input.gif diff --git a/fixtures/009-png-with-french-text/009.config.ts b/fixtures/009-png-with-french-text/009.config.ts new file mode 100644 index 0000000..7406adb --- /dev/null +++ b/fixtures/009-png-with-french-text/009.config.ts @@ -0,0 +1,7 @@ +import type { PartialExtractorConfig } from '../../src/types'; + +export const config: PartialExtractorConfig = { + tesseract: { + languages: ['fra'], + }, +}; diff --git a/fixtures/009-png-with-french-text/009.expected.txt b/fixtures/009-png-with-french-text/009.expected.txt new file mode 100644 index 0000000..cd5f2fa --- /dev/null +++ b/fixtures/009-png-with-french-text/009.expected.txt @@ -0,0 +1,20 @@ +Vous savez, moi je ne crois pas qu'il y ait de bonne ou de +mauvaise situation. Moi, si je devais résumer ma vie +aujourd’hui avec vous, je dirais que c'est d'abord des +rencontres. Des gens qui m'ont tendu la main, peut-être à +un moment où je ne pouvais pas, où j'étais seul chez moi. +Et c'est assez curieux de se dire que les hasards, les +rencontres forgent une destinée... Parce que quand on a le +goût de la chose, quand on a le goût de la chose bien +faite, le beau geste, parfois on ne trouve pas +l'interlocuteur en face je dirais, le miroir qui vous aide à +avancer. 
Alors ça n’est pas mon cas, comme je disais là, +puisque moi au contraire, j'ai pu ; et je dis merci à la vie, +je lui dis merci, je chante la vie, je danse la vie... je ne suis +qu'amour ! Et finalement, quand des gens me disent « +Mais comment fais-tu pour avoir cette humanité ? », je +leur réponds très simplement que c'est ce goût de +l'amour, ce goût donc qui m'a poussé aujourd'hui à +entreprendre une construction mécanique... mais demain +qui sait ? Peut-être simplement à me mettre au service de +la communauté, à faire le don, le don de soi. diff --git a/fixtures/009-png-with-french-text/009.input.png b/fixtures/009-png-with-french-text/009.input.png new file mode 100644 index 0000000..08ae075 Binary files /dev/null and b/fixtures/009-png-with-french-text/009.input.png differ diff --git a/package.json b/package.json index bf366bd..14d6580 100644 --- a/package.json +++ b/package.json @@ -46,6 +46,7 @@ "release": "bumpp --commit --tag --push" }, "dependencies": { + "@corentinth/chisels": "^1.3.1", "tesseract.js": "^6.0.0", "unpdf": "^0.12.1" }, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 51edc59..b5ff98c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -8,6 +8,9 @@ importers: .: dependencies: + '@corentinth/chisels': + specifier: ^1.3.1 + version: 1.3.1 tesseract.js: specifier: ^6.0.0 version: 6.0.0 @@ -181,6 +184,9 @@ packages: '@clack/prompts@0.9.1': resolution: {integrity: sha512-JIpyaboYZeWYlyP0H+OoPPxd6nqueG/CmN6ixBiNFsIDHREevjIf0n0Ohh5gr5C8pEDknzgvz+pIJ8dMhzWIeg==} + '@corentinth/chisels@1.3.1': + resolution: {integrity: sha512-Tnk3NqeyP4WUMCn/o3DNj8IYmkn6u6ilsB71+OLnRSfgKPfUUHV9XJLXra9h/SA9H+c/X1A4oSjKQBviV5qmFw==} + '@es-joy/jsdoccomment@0.49.0': resolution: {integrity: sha512-xjZTSFgECpb9Ohuk5yMX5RhUEbfeQcuOp8IF60e+wyzWEF0M5xeSgqsfLtvPEX8BIyOX9saZqzuGPmZ8oWc+5Q==} engines: {node: '>=16'} @@ -2981,6 +2987,8 @@ snapshots: picocolors: 1.1.1 sisteransi: 1.0.5 + '@corentinth/chisels@1.3.1': {} + '@es-joy/jsdoccomment@0.49.0': dependencies: 
comment-parser: 1.4.1 diff --git a/src/config.test.ts b/src/config.test.ts new file mode 100644 index 0000000..e9b8c73 --- /dev/null +++ b/src/config.test.ts @@ -0,0 +1,20 @@ +import { describe, expect, test } from 'vitest'; +import { parseConfig } from './config'; + +describe('config', () => { + describe('parseConfig', () => { + test('a non supported language for tesseract raises an error', () => { + expect(() => parseConfig({ rawConfig: { tesseract: { languages: ['invalid'] } } })).toThrow('Invalid languages for tesseract: invalid. Valid languages are: afr, amh, ara, asm, aze, aze_cyrl, bel, ben, bod, bos, bul, cat, ceb, ces, chi_sim, chi_tra, chr, cym, dan, deu, dzo, ell, eng, enm, epo, est, eus, fas, fin, fra, frk, frm, gle, glg, grc, guj, hat, heb, hin, hrv, hun, iku, ind, isl, ita, ita_old, jav, jpn, kan, kat, kat_old, kaz, khm, kir, kor, kur, lao, lat, lav, lit, mal, mar, mkd, mlt, msa, mya, nep, nld, nor, ori, pan, pol, por, pus, ron, rus, san, sin, slk, slv, spa, spa_old, sqi, srp, srp_latn, swa, swe, syr, tam, tel, tgk, tgl, tha, tir, tur, uig, ukr, urd, uzb, uzb_cyrl, vie, yid'); + }); + + test('when the ocr language is not specified, undefined or empty array, the default `eng` is used', () => { + const { config } = parseConfig({ rawConfig: { tesseract: { languages: [] } } }); + expect(config.tesseract.languages).to.eql(['eng']); + }); + + test('the ocr language can be a single language', () => { + const { config } = parseConfig({ rawConfig: { tesseract: { languages: ['fra'] } } }); + expect(config.tesseract.languages).to.eql(['fra']); + }); + }); +}); diff --git a/src/config.ts b/src/config.ts new file mode 100644 index 0000000..cd8b7e6 --- /dev/null +++ b/src/config.ts @@ -0,0 +1,21 @@ +import type { ExtractorConfig, PartialExtractorConfig } from './types'; +import { languages as tesseractLanguages } from 'tesseract.js'; + +const languages = Object.values(tesseractLanguages); + +export function parseConfig({ rawConfig = {} }: { rawConfig?: 
PartialExtractorConfig } = {}): { config: ExtractorConfig } { + const ocrLanguages = rawConfig.tesseract?.languages ?? []; + const invalidLanguages = ocrLanguages.filter(language => !languages.includes(language)); + + if (invalidLanguages.length > 0) { + throw new Error(`Invalid languages for tesseract: ${invalidLanguages.join(', ')}. Valid languages are: ${languages.join(', ')}`); + } + + return { + config: { + tesseract: { + languages: ocrLanguages.length > 0 ? ocrLanguages : ['eng'], + }, + }, + }; +} diff --git a/src/extractors.models.ts b/src/extractors.models.ts index 46e09a0..9795fc5 100644 --- a/src/extractors.models.ts +++ b/src/extractors.models.ts @@ -1,9 +1,11 @@ +import type { ExtractorConfig } from './types'; + export type ExtractorDefinition = ReturnType<typeof defineTextExtractor>; export function defineTextExtractor(args: { name: string; mimeTypes: string[]; - extract: (args: { arrayBuffer: ArrayBuffer }) => Promise<{ content: string }>; + extract: (args: { arrayBuffer: ArrayBuffer; config: ExtractorConfig }) => Promise<{ content: string }>; }) { return args; } diff --git a/src/extractors.usecases.test.ts b/src/extractors.usecases.test.ts index 753e1a4..896ff74 100644 --- a/src/extractors.usecases.test.ts +++ b/src/extractors.usecases.test.ts @@ -4,7 +4,7 @@ import { glob } from 'tinyglobby'; import { describe, expect, test } from 'vitest'; import { extractText, extractTextFromBlob, extractTextFromFile } from './extractors.usecases'; -const fixtures = await glob(['fixtures/*', '!fixtures/*.expected']); +const fixturesDir = await glob(['fixtures/*'], { onlyDirectories: true }); describe('extractors usecases', () => { describe('extractText', () => { @@ -32,22 +32,31 @@ describe('extractors usecases', () => { }); describe('text is extracted from fixtures files', async () => { - test('at least one fixture file is found', () => { - expect(fixtures.length).to.be.greaterThan(0); + test('at least one fixture file is present', () => { + expect(fixturesDir.length).to.be.greaterThan(0); 
}); - for (const fixture of fixtures) { - test(`fixture ${fixture}`, async () => { - const arrayBuffer = (await fs.readFile(fixture)).buffer as ArrayBuffer; - const mimeType = mime.getType(fixture); + for (const fixture of fixturesDir) { + // use test.concurrent to run the tests in parallel -> need to use the provided expect + test.concurrent(`fixture ${fixture}`, async ({ expect }) => { + const fixtureFilesPaths = await glob([`${fixture}/*`]); + const inputFilePath = fixtureFilesPaths.find(name => name.match(/\/\d{3}\.input\.\w+$/)); + const configFilePath = fixtureFilesPaths.find(name => name.match(/\/\d{3}\.config\.ts$/)); - const { textContent, error, extractorName } = await extractText({ arrayBuffer, mimeType }); + const config = configFilePath ? (await import(configFilePath)).config : undefined; + + const arrayBuffer = (await fs.readFile(inputFilePath)).buffer as ArrayBuffer; + const mimeType = mime.getType(inputFilePath); + + const { textContent, error, extractorName } = await extractText({ arrayBuffer, mimeType, config }); expect(error).to.eql(undefined); expect(extractorName).to.not.eql(undefined); - const snapshotFilename = fixture.split('/').pop().replace(/\..*$/, '.expected'); - await expect(textContent).toMatchFileSnapshot(`../fixtures/${snapshotFilename}`, 'Fixture does not match snapshot'); + const fixtureNumber = fixture.split('/').filter(Boolean).pop().slice(0, 3); + const expectedFilePath = `../${fixture}/${fixtureNumber}.expected.txt`; + + await expect(textContent).toMatchFileSnapshot(expectedFilePath, 'Fixture does not match snapshot'); }); } }); diff --git a/src/extractors.usecases.ts b/src/extractors.usecases.ts index 5162945..d15bcb1 100644 --- a/src/extractors.usecases.ts +++ b/src/extractors.usecases.ts @@ -1,10 +1,13 @@ +import type { ExtractorConfig } from './types'; +import { parseConfig } from './config'; import { getExtractor } from './extractors.registry'; -export async function extractText({ arrayBuffer, mimeType }: { arrayBuffer: 
ArrayBuffer; mimeType: string }): Promise<{ +export async function extractText({ arrayBuffer, mimeType, config: rawConfig }: { arrayBuffer: ArrayBuffer; mimeType: string; config?: ExtractorConfig }): Promise<{ extractorName: string | undefined; textContent: string | undefined; error?: Error; }> { + const { config } = parseConfig({ rawConfig }); const { extractor } = getExtractor({ mimeType }); if (!extractor) { @@ -15,7 +18,7 @@ export async function extractText({ arrayBuffer, mimeType }: { arrayBuffer: Arra } try { - const { content } = await extractor.extract({ arrayBuffer }); + const { content } = await extractor.extract({ arrayBuffer, config }); return { extractorName: extractor.name, diff --git a/src/extractors/img.extractor.ts b/src/extractors/img.extractor.ts index 0638b18..1ccfe27 100644 --- a/src/extractors/img.extractor.ts +++ b/src/extractors/img.extractor.ts @@ -10,10 +10,12 @@ export const imageExtractorDefinition = defineTextExtractor({ 'image/webp', 'image/gif', ], - extract: async ({ arrayBuffer }) => { + extract: async ({ arrayBuffer, config }) => { + const { languages } = config.tesseract; + const buffer = Buffer.from(arrayBuffer); - const worker = await createWorker(); + const worker = await createWorker(languages); const { data: { text } } = await worker.recognize(buffer); await worker.terminate(); diff --git a/src/types.ts b/src/types.ts new file mode 100644 index 0000000..73bc20d --- /dev/null +++ b/src/types.ts @@ -0,0 +1,9 @@ +import type { DeepPartial } from '@corentinth/chisels'; + +export type ExtractorConfig = { + tesseract: { + languages: string[]; + }; +}; + +export type PartialExtractorConfig = undefined | DeepPartial<ExtractorConfig>; diff --git a/src/types/tesseract.d.ts b/src/types/tesseract.d.ts new file mode 100644 index 0000000..26d5631 --- /dev/null +++ b/src/types/tesseract.d.ts @@ -0,0 +1,7 @@ +export * from 'tesseract.js'; + +declare module 'tesseract.js' { + type LanguageKey = 'AFR' | 'AMH' | 'ARA' | 'ASM' | 'AZE' | 'AZE_CYRL' | 'BEL' | 
'BEN' | 'BOD' | 'BOS' | 'BUL' | 'CAT' | 'CEB' | 'CES' | 'CHI_SIM' | 'CHI_TRA' | 'CHR' | 'CYM' | 'DAN' | 'DEU' | 'DZO' | 'ELL' | 'ENG' | 'ENM' | 'EPO' | 'EST' | 'EUS' | 'FAS' | 'FIN' | 'FRA' | 'FRK' | 'FRM' | 'GLE' | 'GLG' | 'GRC' | 'GUJ' | 'HAT' | 'HEB' | 'HIN' | 'HRV' | 'HUN' | 'IKU' | 'IND' | 'ISL' | 'ITA' | 'ITA_OLD' | 'JAV' | 'JPN' | 'KAN' | 'KAT' | 'KAT_OLD' | 'KAZ' | 'KHM' | 'KIR' | 'KOR' | 'KUR' | 'LAO' | 'LAT' | 'LAV' | 'LIT' | 'MAL' | 'MAR' | 'MKD' | 'MLT' | 'MSA' | 'MYA' | 'NEP' | 'NLD' | 'NOR' | 'ORI' | 'PAN' | 'POL' | 'POR' | 'PUS' | 'RON' | 'RUS' | 'SAN' | 'SIN' | 'SLK' | 'SLV' | 'SPA' | 'SPA_OLD' | 'SQI' | 'SRP' | 'SRP_LATN' | 'SWA' | 'SWE' | 'SYR' | 'TAM' | 'TEL' | 'TGK' | 'TGL' | 'THA' | 'TIR' | 'TUR' | 'UIG' | 'UKR' | 'URD' | 'UZB' | 'UZB_CYRL' | 'VIE' | 'YID'; + + export const languages: Record<LanguageKey, string>; +}