Skip to content

Commit

Permalink
perf: export dataset data
Browse files Browse the repository at this point in the history
  • Loading branch information
c121914yu committed Sep 18, 2023
1 parent 07f645a commit 022889d
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 104 deletions.
3 changes: 3 additions & 0 deletions client/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
"crypto": "^1.0.1",
"date-fns": "^2.30.0",
"dayjs": "^1.11.7",
"downloadjs": "^1.4.7",
"echarts": "^5.4.1",
"echarts-gl": "^2.0.9",
"formidable": "^2.1.1",
Expand All @@ -48,6 +49,7 @@
"openai": "^3.3.0",
"papaparse": "^5.4.1",
"pg": "^8.10.0",
"pg-query-stream": "^4.5.3",
"react": "18.2.0",
"react-day-picker": "^8.7.1",
"react-dom": "18.2.0",
Expand All @@ -71,6 +73,7 @@
"devDependencies": {
"@svgr/webpack": "^6.5.1",
"@types/cookie": "^0.5.1",
"@types/downloadjs": "^1.4.3",
"@types/formidable": "^2.0.5",
"@types/js-cookie": "^3.0.3",
"@types/jsdom": "^21.1.1",
Expand Down
101 changes: 56 additions & 45 deletions client/pnpm-lock.yaml

Large diffs are not rendered by default.

23 changes: 18 additions & 5 deletions client/src/api/plugins/kb.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ import { Props as UpdateDataProps } from '@/pages/api/openapi/kb/updateData';
import type { KbUpdateParams, CreateKbParams, GetKbDataListProps } from '../request/kb';
import { QuoteItemType } from '@/types/chat';
import { KbTypeEnum } from '@/constants/kb';
import { getToken } from '@/utils/user';
import download from 'downloadjs';

/* knowledge base */
export const getKbList = (data: { parentId?: string; type?: `${KbTypeEnum}` }) =>
Expand All @@ -35,12 +37,23 @@ export const getKbDataList = (data: GetKbDataListProps) =>
POST(`/plugins/kb/data/getDataList`, data);

/**
* 获取导出数据(不分页)
* export and download data
*/
export const getExportDataList = (data: { kbId: string }) =>
GET<[string, string, string][]>(`/plugins/kb/data/exportModelData`, data, {
timeout: 600000
});
/**
 * Export all rows of a dataset and trigger a browser download as CSV.
 *
 * Sends the auth token in the `token` header, reads the streamed response
 * as a Blob, and hands it to downloadjs as `dataset.csv`.
 *
 * @param data.kbId - id of the knowledge base to export
 * @returns resolves once the download has been triggered
 * @throws Error with the server-provided message when the response is not ok
 */
export const exportDataset = (data: { kbId: string }) =>
  // encode the id so unusual characters cannot break the query string
  fetch(`/api/plugins/kb/data/exportAll?kbId=${encodeURIComponent(data.kbId)}`, {
    method: 'GET',
    headers: {
      token: getToken()
    }
  })
    .then(async (res) => {
      if (!res.ok) {
        // named `errBody` to avoid shadowing the outer `data` parameter
        const errBody = await res.json();
        throw new Error(errBody?.message || 'Export failed');
      }
      return res.blob();
    })
    .then((blob) => download(blob, 'dataset.csv', 'text/csv'));

/**
* 获取模型正在拆分数据的数量
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,18 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@/service/response';
import { connectToDatabase, User } from '@/service/mongo';
import { authUser } from '@/service/utils/auth';
import { PgClient } from '@/service/pg';
import { PgDatasetTableName } from '@/constants/plugin';
import { findAllChildrenIds } from '../delete';
import QueryStream from 'pg-query-stream';
import Papa from 'papaparse';

export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
let { kbId } = req.query as {
kbId: string;
};

if (!kbId) {
if (!kbId || !global.pgClient) {
throw new Error('缺少参数');
}

Expand All @@ -22,7 +23,6 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
const { userId } = await authUser({ req, authToken: true });

const exportIds = [kbId, ...(await findAllChildrenIds(kbId))];
console.log(exportIds);

const thirtyMinutesAgo = new Date(
Date.now() - (global.feConfigs?.limit?.exportLimitMinutes || 0) * 60 * 1000
Expand All @@ -45,37 +45,50 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
throw new Error(`上次导出未到 ${minutes},每 ${minutes}仅可导出一次。`);
}

const where: any = [
['user_id', userId],
'AND',
`kb_id IN (${exportIds.map((id) => `'${id}'`).join(',')})`
];
// 从 pg 中获取所有数据
const pgData = await PgClient.select<{ q: string; a: string; source: string }>(
PgDatasetTableName,
{
where,
fields: ['q', 'a', 'source'],
order: [{ field: 'id', mode: 'DESC' }],
limit: 1000000
// connect pg
global.pgClient.connect((err, client, done) => {
if (err) {
console.error(err);
res.end('Error connecting to database');
return;
}
);
// create pg select stream
const query = new QueryStream(
`SELECT q, a, source FROM ${PgDatasetTableName} where user_id='${userId}' AND kb_id IN (${exportIds
.map((id) => `'${id}'`)
.join(',')})`
);
const stream = client.query(query);

const data: [string, string, string][] = pgData.rows.map((item) => [
item.q.replace(/\n/g, '\\n'),
item.a.replace(/\n/g, '\\n'),
item.source
]);
res.setHeader('Content-Disposition', 'attachment; filename=dataset.csv');
res.setHeader('Content-Type', 'text/csv');

// update export time
await User.findByIdAndUpdate(userId, {
'limit.exportKbTime': new Date()
});
res.write('index,content,source');

jsonRes(res, {
data
// parse data every row
stream.on('data', (row: { q: string; a: string; source?: string }) => {
const csv = Papa.unparse([row], { header: false });
res.write(`\n${csv}`);
});
stream.on('end', async () => {
try {
// update export time
await User.findByIdAndUpdate(userId, {
'limit.exportKbTime': new Date()
});
} catch (error) {}

// close response
done();
res.end();
});
stream.on('error', (err) => {
done(err);
res.end('Error exporting data');
});
});
} catch (err) {
res.status(500);
jsonRes(res, {
code: 500,
error: err
Expand Down
2 changes: 1 addition & 1 deletion client/src/pages/kb/detail/components/Import/Csv.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ const CsvImport = ({ kbId }: { kbId: string }) => {
/>
</Flex>
<Box px={4} fontSize={'sm'} whiteSpace={'pre-wrap'} wordBreak={'break-all'}>
{`q: ${item.q}\na: ${item.a}`}
{`${item.q}\n${item.a}`}
</Box>
</Box>
))
Expand Down
20 changes: 11 additions & 9 deletions client/src/pages/kb/detail/components/Import/FileSelect.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ const UrlFetchModal = dynamic(() => import('./UrlFetchModal'));
const CreateFileModal = dynamic(() => import('./CreateFileModal'));

const nanoid = customAlphabet('abcdefghijklmnopqrstuvwxyz1234567890', 12);
const csvTemplate = `question,answer\n"什么是 laf","laf 是一个云函数开发平台……"\n"什么是 sealos","Sealos 是以 kubernetes 为内核的云操作系统发行版,可以……"`;
const csvTemplate = `index,content,source\n"被索引的内容","对应的答案。CSV 中请注意内容不能包含双引号,双引号是列分割符号","来源,可选。"\n"什么是 laf","laf 是一个云函数开发平台……",""\n"什么是 sealos","Sealos 是以 kubernetes 为内核的云操作系统发行版,可以……",""`;

export type FileItemType = {
id: string;
Expand Down Expand Up @@ -149,21 +149,23 @@ const FileSelect = ({
/* csv file */
if (extension === 'csv') {
const { header, data } = await readCsvContent(file);
if (header[0] !== 'question' || header[1] !== 'answer') {
throw new Error('csv 文件格式有误,请确保 questionanswer 两列');
if (header[0] !== 'index' || header[1] !== 'content') {
throw new Error('csv 文件格式有误,请确保 indexcontent 两列');
}
const fileItem: FileItemType = {
id: filesId[0],
filename: file.name,
icon,
tokens: 0,
text: '',
chunks: data.map((item) => ({
q: item[0],
a: item[1],
source: item[2] || file.name,
file_id: filesId[0]
}))
chunks: data
.filter((item) => item[0])
.map((item) => ({
q: item[0] || '',
a: item[1] || '',
source: item[2] || file.name || '',
file_id: filesId[0]
}))
};

chunkFiles.unshift(fileItem);
Expand Down
18 changes: 2 additions & 16 deletions client/src/pages/kb/list/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import PageContainer from '@/components/PageContainer';
import { useConfirm } from '@/hooks/useConfirm';
import { AddIcon } from '@chakra-ui/icons';
import { useQuery } from '@tanstack/react-query';
import { delKbById, getExportDataList, getKbPaths, putKbById } from '@/api/plugins/kb';
import { delKbById, exportDataset, getKbPaths, putKbById } from '@/api/plugins/kb';
import { useTranslation } from 'react-i18next';
import Avatar from '@/components/Avatar';
import MyIcon from '@/components/Icon';
Expand All @@ -27,8 +27,6 @@ import MyMenu from '@/components/MyMenu';
import { useRequest } from '@/hooks/useRequest';
import { useGlobalStore } from '@/store/global';
import { useEditTitle } from '@/hooks/useEditTitle';
import Papa from 'papaparse';
import { fileDownload } from '@/utils/file';
import { feConfigs } from '@/store/static';

const CreateModal = dynamic(() => import('./component/CreateModal'), { ssr: false });
Expand Down Expand Up @@ -90,19 +88,7 @@ const Kb = () => {
const { mutate: onclickExport } = useRequest({
mutationFn: (kbId: string) => {
setLoading(true);
return getExportDataList({ kbId });
},
onSuccess(res) {
const text = Papa.unparse({
fields: ['question', 'answer', 'source'],
data: res
});

fileDownload({
text,
type: 'text/csv',
filename: 'dataset.csv'
});
return exportDataset({ kbId });
},
onSettled() {
setLoading(false);
Expand Down

0 comments on commit 022889d

Please sign in to comment.