Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 94 additions & 10 deletions src/main/knowledge/preprocess/MineruPreprocessProvider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@ import path from 'node:path'

import { loggerService } from '@logger'
import { fileStorage } from '@main/services/FileStorage'
import { FileMetadata, PreprocessProvider } from '@types'
import { imageExts } from '@shared/config/constant'
import { FileMetadata, FileTypes, PreprocessProvider } from '@types'
import AdmZip from 'adm-zip'
import { net } from 'electron'
import { PDFDocument } from 'pdf-lib'
import sharp from 'sharp'

import BasePreprocessProvider from './BasePreprocessProvider'

Expand Down Expand Up @@ -63,33 +66,54 @@ export default class MineruPreprocessProvider extends BasePreprocessProvider {
sourceId: string,
file: FileMetadata
): Promise<{ processedFile: FileMetadata; quota: number }> {
let cleanupPath: string | null = null
try {
const filePath = fileStorage.getFilePathById(file)
logger.info(`MinerU preprocess processing started: ${filePath}`)
await this.validateFile(filePath)
const originalPath = fileStorage.getFilePathById(file)
logger.info(`MinerU preprocess processing started: ${originalPath}`)

let workingFile: FileMetadata = file
let workingFilePath = originalPath

if (this.isImageFile(file)) {
const converted = await this.convertImageToPdf(file, originalPath)
workingFile = converted.metadata
workingFilePath = converted.path
cleanupPath = converted.path
logger.info(`Converted image to PDF for MinerU preprocessing: ${workingFilePath}`)
}

await this.validateFile(workingFilePath)

// 1. 获取上传URL并上传文件
const batchId = await this.uploadFile(file)
const batchId = await this.uploadFile(workingFile, workingFilePath)
logger.info(`MinerU file upload completed: batch_id=${batchId}`)

// 2. 等待处理完成并获取结果
const extractResult = await this.waitForCompletion(sourceId, batchId, file.origin_name)
const extractResult = await this.waitForCompletion(sourceId, batchId, workingFile.origin_name)
logger.info(`MinerU processing completed for batch: ${batchId}`)

// 3. 下载并解压文件
const { path: outputPath } = await this.downloadAndExtractFile(extractResult.full_zip_url!, file)
const { path: outputPath } = await this.downloadAndExtractFile(extractResult.full_zip_url!, workingFile)

// 4. check quota
const quota = await this.checkQuota()

// 5. 创建处理后的文件信息
return {
processedFile: this.createProcessedFileInfo(file, outputPath),
processedFile: this.createProcessedFileInfo(workingFile, outputPath),
quota
}
} catch (error: any) {
logger.error(`MinerU preprocess processing failed for:`, error as Error)
throw new Error(error.message)
} finally {
if (cleanupPath) {
try {
await fs.promises.unlink(cleanupPath)
} catch (cleanupError) {
logger.warn(`Failed to cleanup temporary MinerU conversion file ${cleanupPath}:`, cleanupError as Error)
}
}
}
}

Expand Down Expand Up @@ -207,12 +231,11 @@ export default class MineruPreprocessProvider extends BasePreprocessProvider {
}
}

private async uploadFile(file: FileMetadata): Promise<string> {
private async uploadFile(file: FileMetadata, filePath: string): Promise<string> {
try {
// 步骤1: 获取上传URL
const { batchId, fileUrls } = await this.getBatchUploadUrls(file)
// 步骤2: 上传文件到获取的URL
const filePath = fileStorage.getFilePathById(file)
await this.putFileToUrl(filePath, fileUrls[0])
logger.info(`File uploaded successfully: ${filePath}`, { batchId, fileUrls })

Expand All @@ -223,6 +246,67 @@ export default class MineruPreprocessProvider extends BasePreprocessProvider {
}
}

/**
 * Reports whether the file's extension appears in the shared `imageExts` list.
 *
 * The extension is lower-cased before the lookup; a missing or empty
 * extension is compared as the empty string.
 *
 * @param file - Metadata of the file to classify; only `ext` is read.
 * @returns `true` when the normalized extension is listed in `imageExts`.
 */
private isImageFile(file: FileMetadata): boolean {
  const normalizedExt = file.ext ? file.ext.toLowerCase() : ''
  return imageExts.includes(normalizedExt)
}

/**
 * Wraps a single image in a one-page PDF and writes it to
 * `<storageDir>/<file.id>-mineru.pdf`.
 *
 * `.jpg` / `.jpeg` input is embedded as-is; any other extension is first
 * re-encoded to PNG with sharp and then embedded. The page is sized to the
 * embedded image's own width and height.
 *
 * If any step fails after the PDF write has started, the (possibly partial)
 * output file is removed before the error is rethrown, so a failed
 * conversion does not leave a stray file behind.
 *
 * @param file - Metadata of the source image; `id`, `ext` and `origin_name` are read.
 * @param sourcePath - Path the image bytes are read from.
 * @returns The PDF's path plus a copy of `file` whose `path`, `size`, `ext`,
 *   `origin_name`, `name`, `type` and `created_at` describe the generated PDF.
 * @throws Error with the message `Failed to convert image to PDF: <reason>`.
 */
private async convertImageToPdf(
  file: FileMetadata,
  sourcePath: string
): Promise<{ metadata: FileMetadata; path: string }> {
  // Set right before the PDF is written; lets the catch block know there may be a file to remove.
  let writtenPdfPath: string | null = null

  try {
    const ext = (file.ext || '').toLowerCase()
    const imageBuffer = await fs.promises.readFile(sourcePath)

    const pdfDoc = await PDFDocument.create()
    // JPEG bytes are embedded unchanged; every other extension is re-encoded to PNG first.
    const embeddedImage =
      ext === '.jpg' || ext === '.jpeg'
        ? await pdfDoc.embedJpg(imageBuffer)
        : await pdfDoc.embedPng(await sharp(imageBuffer).png().toBuffer())
    const { width, height } = embeddedImage

    // One page exactly the size of the image, with the image covering it.
    const page = pdfDoc.addPage([width, height])
    page.drawImage(embeddedImage, { x: 0, y: 0, width, height })

    const pdfBytes = await pdfDoc.save()
    const pdfPath = path.join(this.storageDir, `${file.id}-mineru.pdf`)
    writtenPdfPath = pdfPath
    await fs.promises.writeFile(pdfPath, pdfBytes)

    // Fall back to the full original name when it has no parsable base name.
    const originNameBase = path.parse(file.origin_name).name || file.origin_name

    const metadata: FileMetadata = {
      ...file,
      path: pdfPath,
      size: pdfBytes.length,
      ext: '.pdf',
      origin_name: `${originNameBase}.pdf`,
      name: `${file.id}.pdf`,
      type: FileTypes.DOCUMENT,
      created_at: new Date().toISOString()
    }

    return { metadata, path: pdfPath }
  } catch (error: unknown) {
    // Non-Error throwables have no `.message`; stringify them instead of reporting "undefined".
    const reason = error instanceof Error ? error.message : String(error)

    if (writtenPdfPath) {
      try {
        // `force: true` makes this a no-op when the write failed before the file was created.
        await fs.promises.rm(writtenPdfPath, { force: true })
      } catch (cleanupError) {
        logger.warn(
          `Failed to cleanup temporary MinerU conversion file ${writtenPdfPath}:`,
          cleanupError as Error
        )
      }
    }

    logger.error(`Failed to convert image ${file.origin_name} to PDF: ${reason}`)
    throw new Error(`Failed to convert image to PDF: ${reason}`)
  }
}

private async getBatchUploadUrls(file: FileMetadata): Promise<{ batchId: string; fileUrls: string[] }> {
const endpoint = `${this.provider.apiHost}/api/v4/file-urls/batch`

Expand Down
52 changes: 31 additions & 21 deletions src/main/services/KnowledgeService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* @see {@link ../../../docs/technical/KnowledgeService.md}
*/

import * as fs from 'node:fs'

Check failure on line 16 in src/main/services/KnowledgeService.ts

View workflow job for this annotation

GitHub Actions / build

Run autofix to sort these imports!
import path from 'node:path'

import { RAGApplication, RAGApplicationBuilder } from '@cherrystudio/embedjs'
Expand All @@ -31,10 +31,10 @@
import { getDataPath } from '@main/utils'
import { getAllFiles, sanitizeFilename } from '@main/utils/file'
import { TraceMethod } from '@mcp-trace/trace-core'
import { MB } from '@shared/config/constant'
import { MB, imageExts } from '@shared/config/constant'
import type { LoaderReturn } from '@shared/config/types'
import { IpcChannel } from '@shared/IpcChannel'
import { FileMetadata, KnowledgeBaseParams, KnowledgeItem, KnowledgeSearchResult } from '@types'
import { FileMetadata, FileTypes, KnowledgeBaseParams, KnowledgeItem, KnowledgeSearchResult } from '@types'
import { v4 as uuidv4 } from 'uuid'

const logger = loggerService.withContext('MainKnowledgeService')
Expand Down Expand Up @@ -298,6 +298,12 @@
this.workload >= KnowledgeService.MAXIMUM_WORKLOAD
)
}

/**
 * Decides whether a file should be treated as an image.
 *
 * A file qualifies when its `type` is `FileTypes.IMAGE`, or when its
 * lower-cased extension (empty string if absent) matches an entry of the
 * shared `imageExts` list.
 *
 * @param file - Metadata of the file to classify; `type` and `ext` are read.
 * @returns `true` when either condition holds.
 */
private isImageFile(file: FileMetadata): boolean {
  const normalizedExt = file.ext ? file.ext.toLowerCase() : ''
  if (file.type === FileTypes.IMAGE) {
    return true
  }
  return imageExts.some((candidate) => candidate === normalizedExt)
}

private fileTask(
ragApplication: RAGApplication,
options: KnowledgeBaseAddItemOptionsNonNullableAttribute
Expand All @@ -313,23 +319,20 @@
try {
// Add preprocessing logic
const fileToProcess: FileMetadata = await this.preprocessing(file, base, item, userId)

// Use processed file for loading
return addFileLoader(ragApplication, fileToProcess, base, forceReload)
.then((result) => {
loaderTask.loaderDoneReturn = result
return result
})
.catch((e) => {
logger.error(`Error in addFileLoader for ${file.name}: ${e}`)
const errorResult: LoaderReturn = {
...KnowledgeService.ERROR_LOADER_RETURN,
message: e.message,
messageSource: 'embedding'
}
loaderTask.loaderDoneReturn = errorResult
return errorResult
})
try {
const result = await addFileLoader(ragApplication, fileToProcess, base, forceReload)
loaderTask.loaderDoneReturn = result
return result
} catch (e: any) {
logger.error(`Error in addFileLoader for ${file.name}: ${e}`)
const errorResult: LoaderReturn = {
...KnowledgeService.ERROR_LOADER_RETURN,
message: e.message,
messageSource: 'embedding'
}
loaderTask.loaderDoneReturn = errorResult
return errorResult
}
} catch (e: any) {
logger.error(`Preprocessing failed for ${file.name}: ${e}`)
const errorResult: LoaderReturn = {
Expand Down Expand Up @@ -692,9 +695,16 @@
userId: string
): Promise<FileMetadata> => {
let fileToProcess: FileMetadata = file
if (base.preprocessProvider && file.ext.toLowerCase() === '.pdf') {
const ext = (file.ext || '').toLowerCase()
const preprocessConfig = base.preprocessProvider
const providerId = preprocessConfig?.provider.id
const supportsImagePreprocess =
this.isImageFile(file) && Boolean(providerId && ['mineru', 'open-mineru'].includes(providerId))
const shouldUsePreprocess = Boolean(preprocessConfig && (ext === '.pdf' || supportsImagePreprocess))

if (shouldUsePreprocess && preprocessConfig) {
try {
const provider = new PreprocessProvider(base.preprocessProvider.provider, userId)
const provider = new PreprocessProvider(preprocessConfig.provider, userId)
const filePath = fileStorage.getFilePathById(file)
// Check if file has already been preprocessed
const alreadyProcessed = await provider.checkIfAlreadyProcessed(file)
Expand Down
6 changes: 3 additions & 3 deletions src/renderer/src/pages/knowledge/items/KnowledgeFiles.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import FileManager from '@renderer/services/FileManager'
import { getProviderName } from '@renderer/services/ProviderService'
import { FileMetadata, FileTypes, isKnowledgeFileItem, KnowledgeBase, KnowledgeItem } from '@renderer/types'
import { formatFileSize, uuid } from '@renderer/utils'
import { bookExts, documentExts, textExts, thirdPartyApplicationExts } from '@shared/config/constant'
import { bookExts, documentExts, imageExts, textExts, thirdPartyApplicationExts } from '@shared/config/constant'
import { Button, Tooltip, Upload } from 'antd'
import dayjs from 'dayjs'
import { FC, useCallback, useEffect, useState } from 'react'
Expand Down Expand Up @@ -40,7 +40,7 @@ interface KnowledgeContentProps {
preprocessMap: Map<string, boolean>
}

const fileTypes = [...bookExts, ...thirdPartyApplicationExts, ...documentExts, ...textExts]
const fileTypes = [...bookExts, ...thirdPartyApplicationExts, ...documentExts, ...textExts, ...imageExts]

const getDisplayTime = (item: KnowledgeItem) => {
const timestamp = item.updated_at && item.updated_at > item.created_at ? item.updated_at : item.created_at
Expand Down Expand Up @@ -165,7 +165,7 @@ const KnowledgeFiles: FC<KnowledgeContentProps> = ({ selectedBase, progressMap,
openFileDialogOnClick={false}>
<p className="ant-upload-text">{t('knowledge.drag_file')}</p>
<p className="ant-upload-hint">
{t('knowledge.file_hint', { file_types: 'TXT, MD, HTML, PDF, DOCX, PPTX, XLSX, EPUB...' })}
{t('knowledge.file_hint', { file_types: 'TXT, MD, HTML, PDF, DOCX, PPTX, XLSX, EPUB, PNG, JPG...' })}
</p>
</Dragger>
</div>
Expand Down
13 changes: 9 additions & 4 deletions src/renderer/src/queue/KnowledgeQueue.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { loggerService } from '@logger'

Check failure on line 1 in src/renderer/src/queue/KnowledgeQueue.ts

View workflow job for this annotation

GitHub Actions / build

Run autofix to sort these imports!
import db from '@renderer/databases'
import { getStoreSetting } from '@renderer/hooks/useSettings'
import { getKnowledgeBaseParams } from '@renderer/services/KnowledgeService'
Expand All @@ -10,7 +10,7 @@
updateBaseItemUniqueId,
updateItemProcessingStatus
} from '@renderer/store/knowledge'
import { KnowledgeItem } from '@renderer/types'
import { FileTypes, KnowledgeItem, isKnowledgeFileItem } from '@renderer/types'
import { uuid } from '@renderer/utils'
import type { LoaderReturn } from '@shared/config/types'
import { t } from 'i18next'
Expand Down Expand Up @@ -127,11 +127,16 @@
throw new Error(`[KnowledgeQueue] Source item ${item.id} not found in base ${baseId}`)
}

let result: LoaderReturn | null = null
let note, content
let result: LoaderReturn | null = null
let note, content

logger.info(`Processing item: ${sourceItem.content}`)

const isImageItem = isKnowledgeFileItem(sourceItem) && sourceItem.content.type === FileTypes.IMAGE
const preprocessProviderId = base.preprocessProvider?.provider.id
const shouldUsePreprocessForImage =
isImageItem && Boolean(preprocessProviderId && ['mineru', 'open-mineru'].includes(preprocessProviderId))

switch (item.type) {
case 'note':
note = await db.knowledge_notes.get(item.id)
Expand Down Expand Up @@ -201,7 +206,7 @@
updateBaseItemIsPreprocessed({
baseId,
itemId: item.id,
isPreprocessed: !!base.preprocessProvider
isPreprocessed: shouldUsePreprocessForImage ? true : !!base.preprocessProvider
})
)
}
Expand Down
Loading