LangChain 文档分割

安装 langchain 包

  npm i langchain

代码示例

 const path = require("path");
 const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 const { PDFLoader } = require("langchain/document_loaders/fs/pdf");
 const { TextLoader } = require("langchain/document_loaders/fs/text");
 const { DocxLoader } = require("langchain/document_loaders/fs/docx");
 const { EPubLoader } = require("langchain/document_loaders/fs/epub");
    /**
     * Picks the document loader that matches a file's type.
     *
     * The type is compared case-insensitively and a single leading dot is
     * ignored, so "pdf", "PDF" and ".pdf" all select the PDF loader.
     *
     * @param {string} filepath - Path of the file the loader should read.
     * @param {string} mimeType - File type key: "pdf", "epub", "txt" or "docx".
     * @returns {PDFLoader|EPubLoader|TextLoader|DocxLoader} Loader constructed with `filepath`.
     * @throws {Error} If `filepath` is falsy or `mimeType` is not a supported type.
     */
    function documentClassification(filepath, mimeType) {
        if (!filepath) {
            throw new Error(`路径出现问题:${filepath}`);
        }

        // Normalise only string input; anything else falls through to the
        // "unsupported type" error below with its original value in the message.
        const type = typeof mimeType === 'string'
            ? mimeType.trim().replace(/^\./, '').toLowerCase()
            : mimeType;

        switch (type) {
            case 'pdf':
                return new PDFLoader(filepath);
            case 'epub':
                return new EPubLoader(filepath);
            case 'txt':
                return new TextLoader(filepath);
            case 'docx':
                return new DocxLoader(filepath);
            default:
                throw new Error(`无法解析的类型:${mimeType}`);
        }
    }
    /**
     * Loads a document from disk and splits it into text chunks.
     *
     * The file type is taken from the extension of `filepath` (lower-cased,
     * without the dot) and handed to `documentClassification` to obtain a
     * loader; the loaded documents are then split with
     * `RecursiveCharacterTextSplitter`.
     *
     * @param {Object} options
     * @param {string} options.filepath - Path of the file to load.
     * @param {string} [options.filename] - File name; accepted for caller
     *     compatibility but not used by this function.
     * @param {number} [options.chunkSize=1000] - Chunk size given to the splitter.
     * @returns {Promise<{splitterDocs: Array, useTokens: number}>} The split
     *     documents and an estimate computed as chunk count × `chunkSize`.
     * @throws {Error} If `filepath` is not a non-empty string, or if the loader
     *     lookup, loading or splitting fails.
     */
    async function documentReading({ filepath, filename, chunkSize = 1000 } = {}) {
        if (typeof filepath !== 'string' || filepath === '') {
            throw new Error(`路径出现问题:${filepath}`);
        }

        // path.extname returns the LAST extension only, so dots in directory
        // names or in the base name ("./docs.v2/report.final.pdf") are handled.
        const mimeType = path.extname(filepath).slice(1).toLowerCase();
        console.log('文件类型', mimeType);

        const splitter = new RecursiveCharacterTextSplitter({ chunkSize });

        const loader = documentClassification(filepath, mimeType);
        const docs = await loader.load();

        const splitterDocs = await splitter.splitDocuments(docs);
        console.log('文档分割完成');

        // Each chunk is counted at the full chunk size.
        return { splitterDocs, useTokens: splitterDocs.length * chunkSize };
    }
     // Expose documentReading as this module's single (CommonJS) export.
     module.exports = documentReading
Logo

技术共进,成长同行——讯飞AI开发者社区

更多推荐