Skip to content

Commit

Permalink
Perf: read file worker (#1337)
Browse files Browse the repository at this point in the history
* perf: read file worker

* fix: Http node url input

* fix: htm2md

* fix: html2md

* fix: ts

* perf: Problem classification increases the matching order

* feat: tool response answer
  • Loading branch information
c121914yu committed Apr 30, 2024
1 parent 1529c1e commit b5f0ac3
Show file tree
Hide file tree
Showing 35 changed files with 413 additions and 398 deletions.
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bugs.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ assignees: ''
- [ ] 公有云版本
- [ ] 私有部署版本, 具体版本号:

**问题描述**
**问题描述, 日志截图**

**复现步骤**

Expand Down
11 changes: 10 additions & 1 deletion packages/global/core/workflow/template/system/tools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,14 @@ export const ToolModule: FlowNodeTemplateType = {
Input_Template_History,
Input_Template_UserChatInput
],
outputs: []
outputs: [
{
id: NodeOutputKeyEnum.answerText,
key: NodeOutputKeyEnum.answerText,
label: 'core.module.output.label.Ai response content',
description: 'core.module.output.description.Ai response content',
valueType: WorkflowIOValueTypeEnum.string,
type: FlowNodeOutputTypeEnum.static
}
]
};
11 changes: 3 additions & 8 deletions packages/service/common/file/gridfs/controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
import { MongoFileSchema } from './schema';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { ReadFileByBufferParams } from '../read/type';
import { MongoRwaTextBuffer } from '../../buffer/rawText/schema';
import { readFileRawContent } from '../read/utils';
import { PassThrough } from 'stream';
Expand Down Expand Up @@ -197,19 +196,15 @@ export const readFileContentFromMongo = async ({
});
})();

const params: ReadFileByBufferParams = {
const { rawText } = await readFileRawContent({
extension,
csvFormat,
teamId,
buffer: fileBuffers,
encoding,
metadata: {
relatedId: fileId
}
};

const { rawText } = await readFileRawContent({
extension,
csvFormat,
params
});

if (rawText.trim()) {
Expand Down
23 changes: 0 additions & 23 deletions packages/service/common/file/read/html.ts

This file was deleted.

18 changes: 0 additions & 18 deletions packages/service/common/file/read/markdown.ts

This file was deleted.

12 changes: 0 additions & 12 deletions packages/service/common/file/read/type.d.ts

This file was deleted.

81 changes: 34 additions & 47 deletions packages/service/common/file/read/utils.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,10 @@
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { markdownProcess, simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
import { uploadMongoImg } from '../image/controller';
import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
import { addHours } from 'date-fns';
import { ReadFileByBufferParams } from './type';
import { readFileRawText } from '../read/rawText';
import { readMarkdown } from '../read/markdown';
import { readHtmlRawText } from '../read/html';
import { readPdfFile } from '../read/pdf';
import { readWordFile } from '../read/word';
import { readCsvRawText } from '../read/csv';
import { readPptxRawText } from '../read/pptx';
import { readXlsxRawText } from '../read/xlsx';

import { WorkerNameEnum, runWorker } from '../../../worker/utils';
import { ReadFileResponse } from '../../../worker/file/type';

export const initMarkdownText = ({
teamId,
Expand All @@ -36,46 +30,39 @@ export const initMarkdownText = ({
export const readFileRawContent = async ({
extension,
csvFormat,
params
teamId,
buffer,
encoding,
metadata
}: {
csvFormat?: boolean;
extension: string;
params: ReadFileByBufferParams;
teamId: string;
buffer: Buffer;
encoding: string;
metadata?: Record<string, any>;
}) => {
switch (extension) {
case 'txt':
return readFileRawText(params);
case 'md':
return readMarkdown(params);
case 'html':
return readHtmlRawText(params);
case 'pdf':
return readPdfFile(params);
case 'docx':
return readWordFile(params);
case 'pptx':
return readPptxRawText(params);
case 'xlsx':
const xlsxResult = await readXlsxRawText(params);
if (csvFormat) {
return {
rawText: xlsxResult.formatText || ''
};
}
return {
rawText: xlsxResult.rawText
};
case 'csv':
const csvResult = await readCsvRawText(params);
if (csvFormat) {
return {
rawText: csvResult.formatText || ''
};
}
return {
rawText: csvResult.rawText
};
default:
return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx');
const result = await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
extension,
csvFormat,
encoding,
buffer
});

// markdown data format
if (['md', 'html', 'docx'].includes(extension)) {
result.rawText = await initMarkdownText({
teamId: teamId,
md: result.rawText,
metadata: metadata
});
}

return result;
};

/**
 * Convert an HTML string to markdown.
 *
 * The HTML is handed to the `htmlStr2Md` worker (an empty string is sent when
 * `html` is missing or empty), and the worker's output is then passed through
 * `simpleMarkdownText` before being returned.
 *
 * @param html - HTML source; may be omitted or null.
 * @returns The result of `simpleMarkdownText` applied to the worker output.
 */
export const htmlToMarkdown = async (html?: string | null) => {
  const workerPayload = { html: html || '' };
  const markdown = await runWorker<string>(WorkerNameEnum.htmlStr2Md, workerPayload);

  return simpleMarkdownText(markdown);
};
35 changes: 0 additions & 35 deletions packages/service/common/file/read/word.ts

This file was deleted.

4 changes: 3 additions & 1 deletion packages/service/common/string/cheerio.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { UrlFetchParams, UrlFetchResponse } from '@fastgpt/global/common/file/api';
import * as cheerio from 'cheerio';
import axios from 'axios';
import { htmlToMarkdown } from './markdown';
import { htmlToMarkdown } from '../file/read/utils';

export const cheerioToHtml = ({
fetchUrl,
Expand Down Expand Up @@ -77,7 +77,9 @@ export const urlsFetch = async ({
$,
selector
});
console.log('html====', html);
const md = await htmlToMarkdown(html);
console.log('html====', md);

return {
url,
Expand Down
9 changes: 0 additions & 9 deletions packages/service/common/string/markdown.ts

This file was deleted.

2 changes: 1 addition & 1 deletion packages/service/common/vectorStore/pg/controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ export async function initPg() {
`);

await PgClient.query(
`CREATE INDEX CONCURRENTLY IF NOT EXISTS vector_index ON ${PgDatasetTableName} USING hnsw (vector vector_ip_ops) WITH (m = 32, ef_construction = 64);`
`CREATE INDEX CONCURRENTLY IF NOT EXISTS vector_index ON ${PgDatasetTableName} USING hnsw (vector vector_ip_ops) WITH (m = 32, ef_construction = 100);`
);
await PgClient.query(
`CREATE INDEX CONCURRENTLY IF NOT EXISTS team_dataset_collection_index ON ${PgDatasetTableName} USING btree(team_id, dataset_id, collection_id);`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,9 @@ const completions = async ({
console.log(answer, '----');

const id =
agents.find((item) => answer.includes(item.key) || answer.includes(item.value))?.key || '';
agents.find((item) => answer.includes(item.key))?.key ||
agents.find((item) => answer.includes(item.value))?.key ||
'';

return {
tokens: await countMessagesTokens(messages),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ import { runToolWithPromptCall } from './promptCall';
import { replaceVariable } from '@fastgpt/global/common/string/tools';
import { Prompt_Tool_Call } from './constants';

type Response = DispatchNodeResultType<{}>;
type Response = DispatchNodeResultType<{
[NodeOutputKeyEnum.answerText]: string;
}>;

export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise<Response> => {
const {
Expand Down Expand Up @@ -129,6 +131,10 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise<
const flatUsages = dispatchFlowResponse.map((item) => item.flowUsages).flat();

return {
[NodeOutputKeyEnum.answerText]: assistantResponses
.filter((item) => item.text?.content)
.map((item) => item.text?.content || '')
.join(''),
[DispatchNodeResponseKeyEnum.assistantResponses]: assistantResponses,
[DispatchNodeResponseKeyEnum.nodeResponse]: {
totalPoints: totalPointsUsage,
Expand Down
6 changes: 2 additions & 4 deletions packages/service/core/workflow/dispatch/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -142,10 +142,8 @@ export async function dispatchWorkFlow({
}
if (assistantResponses) {
chatAssistantResponse = chatAssistantResponse.concat(assistantResponses);
}

// save assistant text response
if (answerText) {
} else if (answerText) {
// save assistant text response
const isResponseAnswerText =
inputs.find((item) => item.key === NodeInputKeyEnum.aiChatIsResponseText)?.value ?? true;
if (isResponseAnswerText) {
Expand Down
6 changes: 3 additions & 3 deletions packages/service/core/workflow/dispatch/tools/answer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,24 +19,24 @@ export const dispatchAnswer = (props: Record<string, any>): AnswerResponse => {
res,
detail,
stream,
node: { name },
params: { text = '' }
} = props as AnswerProps;

const formatText = typeof text === 'string' ? text : JSON.stringify(text, null, 2);
const responseText = `\n${formatText}`;

if (res && stream) {
responseWrite({
res,
event: detail ? SseResponseEventEnum.fastAnswer : undefined,
data: textAdaptGptResponse({
text: `\n${formatText}`
text: responseText
})
});
}

return {
[NodeOutputKeyEnum.answerText]: formatText,
[NodeOutputKeyEnum.answerText]: responseText,
[DispatchNodeResponseKeyEnum.nodeResponse]: {
textOutput: formatText
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import Papa from 'papaparse';
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
import { readFileRawText } from './rawText';

// 加载源文件内容
export const readCsvRawText = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => {
export const readCsvRawText = async (params: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
const { rawText } = readFileRawText(params);

const csvArr = Papa.parse(rawText).data as string[][];
Expand Down
23 changes: 23 additions & 0 deletions packages/service/worker/file/extension/docx.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import mammoth from 'mammoth';
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
import { html2md } from '../../htmlStr2Md/utils';

/**
 * Read a docx buffer and return its content as markdown.
 *
 * The buffer is first converted to HTML with mammoth, and that HTML is then
 * passed through `html2md`. If any step throws, the error is logged and the
 * returned promise rejects with a fixed message string.
 *
 * @param buffer - Raw bytes of the docx file.
 * @returns An object whose `rawText` is the markdown produced by `html2md`.
 */
export const readDocsFile = async ({ buffer }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
  try {
    const converted = await mammoth.convertToHtml({ buffer });
    const markdown = html2md(converted.value);

    return { rawText: markdown };
  } catch (error) {
    // Log the underlying failure; callers only receive the fixed message below.
    console.log('error doc read:', error);
    return Promise.reject('Can not read doc file, please convert to PDF');
  }
};
Loading

0 comments on commit b5f0ac3

Please sign in to comment.