Perf: read file worker (#1337)

* perf: read file worker * fix: Http node url input * fix: htm2md * fix: html2md * fix: ts * perf: Problem classification increases the matching order * feat: tool response answer
labring · Apr 30, 2024 · b5f0ac3 · b5f0ac3
1 parent 1529c1e
commit b5f0ac3
Show file tree

Hide file tree

Showing 35 changed files with 413 additions and 398 deletions.
diff --git a/.github/ISSUE_TEMPLATE/bugs.md b/.github/ISSUE_TEMPLATE/bugs.md
@@ -21,7 +21,7 @@ assignees: ''
 - [ ] 公有云版本
 - [ ] 私有部署版本, 具体版本号: 
 
-**问题描述**
+**问题描述, 日志截图**
 
 **复现步骤**
 

diff --git a/packages/global/core/workflow/template/system/tools.ts b/packages/global/core/workflow/template/system/tools.ts
@@ -64,5 +64,14 @@ export const ToolModule: FlowNodeTemplateType = {
     Input_Template_History,
     Input_Template_UserChatInput
   ],
-  outputs: []
+  outputs: [
+    {
+      id: NodeOutputKeyEnum.answerText,
+      key: NodeOutputKeyEnum.answerText,
+      label: 'core.module.output.label.Ai response content',
+      description: 'core.module.output.description.Ai response content',
+      valueType: WorkflowIOValueTypeEnum.string,
+      type: FlowNodeOutputTypeEnum.static
+    }
+  ]
 };
diff --git a/packages/service/common/file/gridfs/controller.ts b/packages/service/common/file/gridfs/controller.ts
@@ -6,7 +6,6 @@ import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
 import { MongoFileSchema } from './schema';
 import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
 import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
-import { ReadFileByBufferParams } from '../read/type';
 import { MongoRwaTextBuffer } from '../../buffer/rawText/schema';
 import { readFileRawContent } from '../read/utils';
 import { PassThrough } from 'stream';
@@ -197,19 +196,15 @@ export const readFileContentFromMongo = async ({
     });
   })();
 
-  const params: ReadFileByBufferParams = {
+  const { rawText } = await readFileRawContent({
+    extension,
+    csvFormat,
     teamId,
     buffer: fileBuffers,
     encoding,
     metadata: {
       relatedId: fileId
     }
-  };
-
-  const { rawText } = await readFileRawContent({
-    extension,
-    csvFormat,
-    params
   });
 
   if (rawText.trim()) {

diff --git a/packages/service/common/file/read/html.ts b/packages/service/common/file/read/html.ts
diff --git a/packages/service/common/file/read/markdown.ts b/packages/service/common/file/read/markdown.ts
diff --git a/packages/service/common/file/read/type.d.ts b/packages/service/common/file/read/type.d.ts
diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts
@@ -1,16 +1,10 @@
-import { markdownProcess } from '@fastgpt/global/common/string/markdown';
+import { markdownProcess, simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
 import { uploadMongoImg } from '../image/controller';
 import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
 import { addHours } from 'date-fns';
-import { ReadFileByBufferParams } from './type';
-import { readFileRawText } from '../read/rawText';
-import { readMarkdown } from '../read/markdown';
-import { readHtmlRawText } from '../read/html';
-import { readPdfFile } from '../read/pdf';
-import { readWordFile } from '../read/word';
-import { readCsvRawText } from '../read/csv';
-import { readPptxRawText } from '../read/pptx';
-import { readXlsxRawText } from '../read/xlsx';
+
+import { WorkerNameEnum, runWorker } from '../../../worker/utils';
+import { ReadFileResponse } from '../../../worker/file/type';
 
 export const initMarkdownText = ({
   teamId,
@@ -36,46 +30,39 @@ export const initMarkdownText = ({
 export const readFileRawContent = async ({
   extension,
   csvFormat,
-  params
+  teamId,
+  buffer,
+  encoding,
+  metadata
 }: {
   csvFormat?: boolean;
   extension: string;
-  params: ReadFileByBufferParams;
+  teamId: string;
+  buffer: Buffer;
+  encoding: string;
+  metadata?: Record<string, any>;
 }) => {
-  switch (extension) {
-    case 'txt':
-      return readFileRawText(params);
-    case 'md':
-      return readMarkdown(params);
-    case 'html':
-      return readHtmlRawText(params);
-    case 'pdf':
-      return readPdfFile(params);
-    case 'docx':
-      return readWordFile(params);
-    case 'pptx':
-      return readPptxRawText(params);
-    case 'xlsx':
-      const xlsxResult = await readXlsxRawText(params);
-      if (csvFormat) {
-        return {
-          rawText: xlsxResult.formatText || ''
-        };
-      }
-      return {
-        rawText: xlsxResult.rawText
-      };
-    case 'csv':
-      const csvResult = await readCsvRawText(params);
-      if (csvFormat) {
-        return {
-          rawText: csvResult.formatText || ''
-        };
-      }
-      return {
-        rawText: csvResult.rawText
-      };
-    default:
-      return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx');
+  const result = await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
+    extension,
+    csvFormat,
+    encoding,
+    buffer
+  });
+
+  // md/html/docx produce markdown text: pass it through initMarkdownText with teamId and metadata
+  if (['md', 'html', 'docx'].includes(extension)) {
+    result.rawText = await initMarkdownText({
+      teamId: teamId,
+      md: result.rawText,
+      metadata: metadata
+    });
   }
+
+  return result;
+};
+
+export const htmlToMarkdown = async (html?: string | null) => {
+  const md = await runWorker<string>(WorkerNameEnum.htmlStr2Md, { html: html || '' });
+
+  return simpleMarkdownText(md);
 };
diff --git a/packages/service/common/file/read/word.ts b/packages/service/common/file/read/word.ts
diff --git a/packages/service/common/string/cheerio.ts b/packages/service/common/string/cheerio.ts
@@ -1,7 +1,7 @@
 import { UrlFetchParams, UrlFetchResponse } from '@fastgpt/global/common/file/api';
 import * as cheerio from 'cheerio';
 import axios from 'axios';
-import { htmlToMarkdown } from './markdown';
+import { htmlToMarkdown } from '../file/read/utils';
 
 export const cheerioToHtml = ({
   fetchUrl,
@@ -77,7 +77,9 @@ export const urlsFetch = async ({
           $,
           selector
         });
+        console.log('html====', html);
         const md = await htmlToMarkdown(html);
+        console.log('html====', md);
 
         return {
           url,

diff --git a/packages/service/common/string/markdown.ts b/packages/service/common/string/markdown.ts
diff --git a/packages/service/common/vectorStore/pg/controller.ts b/packages/service/common/vectorStore/pg/controller.ts
@@ -23,7 +23,7 @@ export async function initPg() {
     `);
 
     await PgClient.query(
-      `CREATE INDEX CONCURRENTLY IF NOT EXISTS vector_index ON ${PgDatasetTableName} USING hnsw (vector vector_ip_ops) WITH (m = 32, ef_construction = 64);`
+      `CREATE INDEX CONCURRENTLY IF NOT EXISTS vector_index ON ${PgDatasetTableName} USING hnsw (vector vector_ip_ops) WITH (m = 32, ef_construction = 100);`
     );
     await PgClient.query(
       `CREATE INDEX CONCURRENTLY IF NOT EXISTS team_dataset_collection_index ON ${PgDatasetTableName} USING btree(team_id, dataset_id, collection_id);`

diff --git a/packages/service/core/workflow/dispatch/agent/classifyQuestion.ts b/packages/service/core/workflow/dispatch/agent/classifyQuestion.ts
@@ -131,7 +131,9 @@ const completions = async ({
   console.log(answer, '----');
 
   const id =
-    agents.find((item) => answer.includes(item.key) || answer.includes(item.value))?.key || '';
+    agents.find((item) => answer.includes(item.key))?.key ||
+    agents.find((item) => answer.includes(item.value))?.key ||
+    '';
 
   return {
     tokens: await countMessagesTokens(messages),

diff --git a/packages/service/core/workflow/dispatch/agent/runTool/index.ts b/packages/service/core/workflow/dispatch/agent/runTool/index.ts
@@ -23,7 +23,9 @@ import { runToolWithPromptCall } from './promptCall';
 import { replaceVariable } from '@fastgpt/global/common/string/tools';
 import { Prompt_Tool_Call } from './constants';
 
-type Response = DispatchNodeResultType<{}>;
+type Response = DispatchNodeResultType<{
+  [NodeOutputKeyEnum.answerText]: string;
+}>;
 
 export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise<Response> => {
   const {
@@ -129,6 +131,10 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise<
   const flatUsages = dispatchFlowResponse.map((item) => item.flowUsages).flat();
 
   return {
+    [NodeOutputKeyEnum.answerText]: assistantResponses
+      .filter((item) => item.text?.content)
+      .map((item) => item.text?.content || '')
+      .join(''),
     [DispatchNodeResponseKeyEnum.assistantResponses]: assistantResponses,
     [DispatchNodeResponseKeyEnum.nodeResponse]: {
       totalPoints: totalPointsUsage,

diff --git a/packages/service/core/workflow/dispatch/index.ts b/packages/service/core/workflow/dispatch/index.ts
@@ -142,10 +142,8 @@ export async function dispatchWorkFlow({
     }
     if (assistantResponses) {
       chatAssistantResponse = chatAssistantResponse.concat(assistantResponses);
-    }
-
-    // save assistant text response
-    if (answerText) {
+    } else if (answerText) {
+      // save assistant text response
       const isResponseAnswerText =
         inputs.find((item) => item.key === NodeInputKeyEnum.aiChatIsResponseText)?.value ?? true;
       if (isResponseAnswerText) {

diff --git a/packages/service/core/workflow/dispatch/tools/answer.ts b/packages/service/core/workflow/dispatch/tools/answer.ts
@@ -19,24 +19,24 @@ export const dispatchAnswer = (props: Record<string, any>): AnswerResponse => {
     res,
     detail,
     stream,
-    node: { name },
     params: { text = '' }
   } = props as AnswerProps;
 
   const formatText = typeof text === 'string' ? text : JSON.stringify(text, null, 2);
+  const responseText = `\n${formatText}`;
 
   if (res && stream) {
     responseWrite({
       res,
       event: detail ? SseResponseEventEnum.fastAnswer : undefined,
       data: textAdaptGptResponse({
-        text: `\n${formatText}`
+        text: responseText
       })
     });
   }
 
   return {
-    [NodeOutputKeyEnum.answerText]: formatText,
+    [NodeOutputKeyEnum.answerText]: responseText,
     [DispatchNodeResponseKeyEnum.nodeResponse]: {
       textOutput: formatText
     }

diff --git a/packages/service/common/file/read/csv.ts b/packages/service/worker/file/extension/csv.ts
@@ -1,9 +1,9 @@
 import Papa from 'papaparse';
-import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
+import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
 import { readFileRawText } from './rawText';
 
 // 加载源文件内容
-export const readCsvRawText = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => {
+export const readCsvRawText = async (params: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
   const { rawText } = readFileRawText(params);
 
   const csvArr = Papa.parse(rawText).data as string[][];

diff --git a/packages/service/worker/file/extension/docx.ts b/packages/service/worker/file/extension/docx.ts
@@ -0,0 +1,23 @@
+import mammoth from 'mammoth';
+import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
+import { html2md } from '../../htmlStr2Md/utils';
+
+/**
+ * Read a .docx buffer and return its content as markdown (docx -> HTML via mammoth -> html2md).
+ */
+export const readDocsFile = async ({ buffer }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
+  try {
+    const { value: html } = await mammoth.convertToHtml({
+      buffer
+    });
+
+    const rawText = html2md(html);
+
+    return {
+      rawText
+    };
+  } catch (error) {
+    console.log('error doc read:', error);
+    return Promise.reject('Can not read doc file, please convert to PDF');
+  }
+};