diff --git a/apps/sim/app/api/knowledge/[id]/documents/route.test.ts b/apps/sim/app/api/knowledge/[id]/documents/route.test.ts index 70eacdf46e8..2be0e79bc52 100644 --- a/apps/sim/app/api/knowledge/[id]/documents/route.test.ts +++ b/apps/sim/app/api/knowledge/[id]/documents/route.test.ts @@ -457,11 +457,8 @@ describe('Knowledge Base Documents API Route', () => { }, ], processingOptions: { - chunkSize: 1024, - minCharactersPerChunk: 100, recipe: 'default', lang: 'en', - chunkOverlap: 200, }, } @@ -533,11 +530,8 @@ describe('Knowledge Base Documents API Route', () => { }, ], processingOptions: { - chunkSize: 50, // Invalid: too small - minCharactersPerChunk: 0, // Invalid: too small recipe: 'default', lang: 'en', - chunkOverlap: 1000, // Invalid: too large }, } diff --git a/apps/sim/app/api/knowledge/[id]/documents/route.ts b/apps/sim/app/api/knowledge/[id]/documents/route.ts index 18f7af35ac2..c65507d81f7 100644 --- a/apps/sim/app/api/knowledge/[id]/documents/route.ts +++ b/apps/sim/app/api/knowledge/[id]/documents/route.ts @@ -38,26 +38,14 @@ const CreateDocumentSchema = z.object({ documentTagsData: z.string().optional(), }) -/** - * Schema for bulk document creation with processing options - * - * Processing options units: - * - chunkSize: tokens (1 token ≈ 4 characters) - * - minCharactersPerChunk: characters - * - chunkOverlap: characters - */ const BulkCreateDocumentsSchema = z.object({ documents: z.array(CreateDocumentSchema), - processingOptions: z.object({ - /** Maximum chunk size in tokens (1 token ≈ 4 characters) */ - chunkSize: z.number().min(100).max(4000), - /** Minimum chunk size in characters */ - minCharactersPerChunk: z.number().min(1).max(2000), - recipe: z.string(), - lang: z.string(), - /** Overlap between chunks in characters */ - chunkOverlap: z.number().min(0).max(500), - }), + processingOptions: z + .object({ + recipe: z.string().optional(), + lang: z.string().optional(), + }) + .optional(), bulk: z.literal(true), }) @@ -246,8 +234,7 @@ export async function POST(req: NextRequest, { params }: { params: Promise<{ id: knowledgeBaseId, documentsCount: createdDocuments.length, uploadType: 'bulk', - chunkSize: validatedData.processingOptions.chunkSize, - recipe: validatedData.processingOptions.recipe, + recipe: validatedData.processingOptions?.recipe, }) } catch (_e) { // Silently fail @@ -256,7 +243,7 @@ export async function POST(req: NextRequest, { params }: { params: Promise<{ id: processDocumentsWithQueue( createdDocuments, knowledgeBaseId, - validatedData.processingOptions, + validatedData.processingOptions ?? {}, requestId ).catch((error: unknown) => { logger.error(`[${requestId}] Critical error in document processing pipeline:`, error) diff --git a/apps/sim/app/api/knowledge/[id]/documents/upsert/route.ts b/apps/sim/app/api/knowledge/[id]/documents/upsert/route.ts index 2499006ed35..1b44c7a81fe 100644 --- a/apps/sim/app/api/knowledge/[id]/documents/upsert/route.ts +++ b/apps/sim/app/api/knowledge/[id]/documents/upsert/route.ts @@ -25,13 +25,12 @@ const UpsertDocumentSchema = z.object({ fileSize: z.number().min(1, 'File size must be greater than 0'), mimeType: z.string().min(1, 'MIME type is required'), documentTagsData: z.string().optional(), - processingOptions: z.object({ - chunkSize: z.number().min(100).max(4000), - minCharactersPerChunk: z.number().min(1).max(2000), - recipe: z.string(), - lang: z.string(), - chunkOverlap: z.number().min(0).max(500), - }), + processingOptions: z + .object({ + recipe: z.string().optional(), + lang: z.string().optional(), + }) + .optional(), workflowId: z.string().optional(), }) @@ -166,7 +165,7 @@ export async function POST(req: NextRequest, { params }: { params: Promise<{ id: processDocumentsWithQueue( createdDocuments, knowledgeBaseId, - validatedData.processingOptions, + validatedData.processingOptions ?? {}, requestId ).catch((error: unknown) => { logger.error(`[${requestId}] Critical error in document processing pipeline:`, error) @@ -178,8 +177,7 @@ export async function POST(req: NextRequest, { params }: { params: Promise<{ id: knowledgeBaseId, documentsCount: 1, uploadType: 'single', - chunkSize: validatedData.processingOptions.chunkSize, - recipe: validatedData.processingOptions.recipe, + recipe: validatedData.processingOptions?.recipe, }) } catch (_e) { // Silently fail diff --git a/apps/sim/app/api/v1/knowledge/[id]/documents/route.ts b/apps/sim/app/api/v1/knowledge/[id]/documents/route.ts index 193111ee20c..7310a4eca98 100644 --- a/apps/sim/app/api/v1/knowledge/[id]/documents/route.ts +++ b/apps/sim/app/api/v1/knowledge/[id]/documents/route.ts @@ -187,8 +187,6 @@ export async function POST(request: NextRequest, { params }: DocumentsRouteParam requestId ) - const chunkingConfig = result.kb.chunkingConfig ?? { maxSize: 1024, minSize: 100, overlap: 200 } - const documentData: DocumentData = { documentId: newDocument.id, filename: file.name, @@ -197,18 +195,7 @@ export async function POST(request: NextRequest, { params }: DocumentsRouteParam mimeType: contentType, } - processDocumentsWithQueue( - [documentData], - knowledgeBaseId, - { - chunkSize: chunkingConfig.maxSize, - minCharactersPerChunk: chunkingConfig.minSize, - chunkOverlap: chunkingConfig.overlap, - recipe: 'default', - lang: 'en', - }, - requestId - ).catch(() => { + processDocumentsWithQueue([documentData], knowledgeBaseId, {}, requestId).catch(() => { // Processing errors are logged internally }) diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/add-documents-modal/add-documents-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/add-documents-modal/add-documents-modal.tsx index 531fcc3f175..5ddb7eb6a20 100644 --- a/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/add-documents-modal/add-documents-modal.tsx +++ b/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/add-documents-modal/add-documents-modal.tsx @@ -195,9 +195,6 @@ export function AddDocumentsModal({ try { await uploadFiles([fileToRetry], knowledgeBaseId, { - chunkSize: chunkingConfig?.maxSize || 1024, - minCharactersPerChunk: chunkingConfig?.minSize || 1, - chunkOverlap: chunkingConfig?.overlap || 200, recipe: 'default', }) removeFile(index) @@ -217,9 +214,6 @@ export function AddDocumentsModal({ try { await uploadFiles(files, knowledgeBaseId, { - chunkSize: chunkingConfig?.maxSize || 1024, - minCharactersPerChunk: chunkingConfig?.minSize || 1, - chunkOverlap: chunkingConfig?.overlap || 200, recipe: 'default', }) logger.info(`Successfully uploaded ${files.length} files`) diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/components/base-card/base-card.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/components/base-card/base-card.tsx index d956af406fa..50933913e03 100644 --- a/apps/sim/app/workspace/[workspaceId]/knowledge/components/base-card/base-card.tsx +++ b/apps/sim/app/workspace/[workspaceId]/knowledge/components/base-card/base-card.tsx @@ -20,6 +20,7 @@ interface BaseCardProps { createdAt?: string updatedAt?: string connectorTypes?: string[] + chunkingConfig?: { maxSize: number; minSize: number; overlap: number } onUpdate?: (id: string, name: string, description: string) => Promise onDelete?: (id: string) => Promise } @@ -78,6 +79,7 @@ export function BaseCard({ description, updatedAt, connectorTypes = [], + chunkingConfig, onUpdate, onDelete, }: BaseCardProps) { @@ -256,6 +258,7 @@ export function BaseCard({ knowledgeBaseId={id} initialName={title} initialDescription={description === 'No description provided' ? '' : description} + chunkingConfig={chunkingConfig} onSave={handleSave} /> )} diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx index 8a89dbf93db..a4e1e44ebc9 100644 --- a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx +++ b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx @@ -269,9 +269,6 @@ export const CreateBaseModal = memo(function CreateBaseModal({ if (files.length > 0) { try { const uploadedFiles = await uploadFiles(files, newKnowledgeBase.id, { - chunkSize: data.maxChunkSize, - minCharactersPerChunk: data.minChunkSize, - chunkOverlap: data.overlapSize, recipe: 'default', }) @@ -358,12 +355,15 @@ export const CreateBaseModal = memo(function CreateBaseModal({ @@ -371,12 +371,15 @@ export const CreateBaseModal = memo(function CreateBaseModal({ @@ -385,12 +388,15 @@ export const CreateBaseModal = memo(function CreateBaseModal({

1 token ≈ 4 characters. Max chunk size and overlap are in tokens. diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/components/edit-knowledge-base-modal/edit-knowledge-base-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/components/edit-knowledge-base-modal/edit-knowledge-base-modal.tsx index 2850bd057be..9c32a2b644a 100644 --- a/apps/sim/app/workspace/[workspaceId]/knowledge/components/edit-knowledge-base-modal/edit-knowledge-base-modal.tsx +++ b/apps/sim/app/workspace/[workspaceId]/knowledge/components/edit-knowledge-base-modal/edit-knowledge-base-modal.tsx @@ -17,6 +17,7 @@ import { Textarea, } from '@/components/emcn' import { cn } from '@/lib/core/utils/cn' +import type { ChunkingConfig } from '@/lib/knowledge/types' const logger = createLogger('EditKnowledgeBaseModal') @@ -26,6 +27,7 @@ interface EditKnowledgeBaseModalProps { knowledgeBaseId: string initialName: string initialDescription: string + chunkingConfig?: ChunkingConfig onSave: (id: string, name: string, description: string) => Promise } @@ -49,6 +51,7 @@ export const EditKnowledgeBaseModal = memo(function EditKnowledgeBaseModal({ knowledgeBaseId, initialName, initialDescription, + chunkingConfig, onSave, }: EditKnowledgeBaseModalProps) { const [isSubmitting, setIsSubmitting] = useState(false) @@ -137,6 +140,47 @@ export const EditKnowledgeBaseModal = memo(function EditKnowledgeBaseModal({

)} + + {chunkingConfig && ( +
+ +
+
+

+ Max Size +

+

+ {chunkingConfig.maxSize.toLocaleString()} + + tokens + +

+
+
+

+ Min Size +

+

+ {chunkingConfig.minSize.toLocaleString()} + + chars + +

+
+
+

+ Overlap +

+

+ {chunkingConfig.overlap.toLocaleString()} + + tokens + +

+
+
+
+ )} diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/hooks/use-knowledge-upload.ts b/apps/sim/app/workspace/[workspaceId]/knowledge/hooks/use-knowledge-upload.ts index 5dcc75ef4b7..265f3f0c7f4 100644 --- a/apps/sim/app/workspace/[workspaceId]/knowledge/hooks/use-knowledge-upload.ts +++ b/apps/sim/app/workspace/[workspaceId]/knowledge/hooks/use-knowledge-upload.ts @@ -46,9 +46,6 @@ export interface UploadError { } export interface ProcessingOptions { - chunkSize?: number - minCharactersPerChunk?: number - chunkOverlap?: number recipe?: string } @@ -1011,10 +1008,7 @@ export function useKnowledgeUpload(options: UseKnowledgeUploadOptions = {}) { ...file, })), processingOptions: { - chunkSize: processingOptions.chunkSize || 1024, - minCharactersPerChunk: processingOptions.minCharactersPerChunk || 1, - chunkOverlap: processingOptions.chunkOverlap || 200, - recipe: processingOptions.recipe || 'default', + recipe: processingOptions.recipe ?? 'default', lang: 'en', }, bulk: true, diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/knowledge.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/knowledge.tsx index f844fa9628e..fcdd8053abc 100644 --- a/apps/sim/app/workspace/[workspaceId]/knowledge/knowledge.tsx +++ b/apps/sim/app/workspace/[workspaceId]/knowledge/knowledge.tsx @@ -602,6 +602,7 @@ export function Knowledge() { knowledgeBaseId={activeKnowledgeBase.id} initialName={activeKnowledgeBase.name} initialDescription={activeKnowledgeBase.description || ''} + chunkingConfig={activeKnowledgeBase.chunkingConfig} onSave={handleUpdateKnowledgeBase} /> )} diff --git a/apps/sim/background/knowledge-processing.ts b/apps/sim/background/knowledge-processing.ts index 8f7d75c4284..5f20d5af285 100644 --- a/apps/sim/background/knowledge-processing.ts +++ b/apps/sim/background/knowledge-processing.ts @@ -15,11 +15,8 @@ export type DocumentProcessingPayload = { mimeType: string } processingOptions: { - chunkSize?: number - minCharactersPerChunk?: number recipe?: string lang?: string - chunkOverlap?: number } requestId: string } diff --git a/apps/sim/lib/knowledge/documents/service.ts b/apps/sim/lib/knowledge/documents/service.ts index 1c991b850f7..f36b47d96c7 100644 --- a/apps/sim/lib/knowledge/documents/service.ts +++ b/apps/sim/lib/knowledge/documents/service.ts @@ -101,11 +101,8 @@ export interface DocumentData { } export interface ProcessingOptions { - chunkSize?: number - minCharactersPerChunk?: number recipe?: string lang?: string - chunkOverlap?: number } export interface DocumentJobData { @@ -416,13 +413,7 @@ export async function processDocumentAsync( fileSize: number mimeType: string }, - processingOptions: { - chunkSize?: number - minCharactersPerChunk?: number - recipe?: string - lang?: string - chunkOverlap?: number - } + processingOptions: ProcessingOptions = {} ): Promise { const startTime = Date.now() try { @@ -456,7 +447,16 @@ export async function processDocumentAsync( logger.info(`[${documentId}] Status updated to 'processing', starting document processor`) - const kbConfig = kb[0].chunkingConfig as { maxSize: number; minSize: number; overlap: number } + const rawConfig = kb[0].chunkingConfig as { + maxSize?: number + minSize?: number + overlap?: number + } | null + const kbConfig = { + maxSize: rawConfig?.maxSize ?? 1024, + minSize: rawConfig?.minSize ?? 100, + overlap: rawConfig?.overlap ?? 200, + } await withTimeout( (async () => { @@ -464,9 +464,9 @@ export async function processDocumentAsync( docData.fileUrl, docData.filename, docData.mimeType, - processingOptions.chunkSize ?? kbConfig.maxSize, - processingOptions.chunkOverlap ?? kbConfig.overlap, - processingOptions.minCharactersPerChunk ?? kbConfig.minSize, + kbConfig.maxSize, + kbConfig.overlap, + kbConfig.minSize, kb[0].userId, kb[0].workspaceId ) @@ -1573,16 +1573,6 @@ export async function retryDocumentProcessing( }, requestId: string ): Promise<{ success: boolean; status: string; message: string }> { - const kb = await db - .select({ - chunkingConfig: knowledgeBase.chunkingConfig, - }) - .from(knowledgeBase) - .where(eq(knowledgeBase.id, knowledgeBaseId)) - .limit(1) - - const kbConfig = kb[0].chunkingConfig as { maxSize: number; minSize: number; overlap: number } - await db.transaction(async (tx) => { await tx.delete(embedding).where(eq(embedding.documentId, documentId)) @@ -1600,14 +1590,6 @@ export async function retryDocumentProcessing( .where(eq(document.id, documentId)) }) - const processingOptions = { - chunkSize: kbConfig.maxSize, - minCharactersPerChunk: kbConfig.minSize, - recipe: 'default', - lang: 'en', - chunkOverlap: kbConfig.overlap, - } - await processDocumentsWithQueue( [ { @@ -1619,7 +1601,7 @@ export async function retryDocumentProcessing( }, ], knowledgeBaseId, - processingOptions, + {}, requestId ) diff --git a/apps/sim/tools/knowledge/create_document.ts b/apps/sim/tools/knowledge/create_document.ts index e209a0e9bd2..feaf8ac3960 100644 --- a/apps/sim/tools/knowledge/create_document.ts +++ b/apps/sim/tools/knowledge/create_document.ts @@ -103,9 +103,6 @@ export const knowledgeCreateDocumentTool: ToolConfig