以下是我的textractUtils.js代码 –
const _ = require("lodash");const aws = require("aws-sdk");const config = require("./config");aws.config.update({ accessKeyId: config.awsAccesskeyID, secretAccessKey: config.awsSecretAccessKey, region: config.awsRegion});const textract = new aws.Textract();const getText = (result, blocksMap) => { let text = ""; if (_.has(result, "Relationships")) { result.Relationships.forEach(relationship => { if (relationship.Type === "CHILD") { relationship.Ids.forEach(childId => { const word = blocksMap[childId]; if (word.BlockType === "WORD") { text += `${word.Text} `; } if (word.BlockType === "SELECTION_ELEMENT") { if (word.SelectionStatus === "SELECTED") { text += `X `; } } }); } }); } return text.trim();};const findValueBlock = (keyBlock, valueMap) => { let valueBlock; keyBlock.Relationships.forEach(relationship => { if (relationship.Type === "VALUE") { // eslint-disable-next-line array-callback-return relationship.Ids.every(valueId => { if (_.has(valueMap, valueId)) { valueBlock = valueMap[valueId]; return false; } }); } }); return valueBlock;};const getKeyValueRelationship = (keyMap, valueMap, blockMap) => { const keyValues = {}; const keyMapValues = _.values(keyMap); keyMapValues.forEach(keyMapValue => { const valueBlock = findValueBlock(keyMapValue, valueMap); const key = getText(keyMapValue, blockMap); const value = getText(valueBlock, blockMap); keyValues[key] = value; }); return keyValues;};const getKeyValueMap = blocks => { const keyMap = {}; const valueMap = {}; const blockMap = {}; let blockId; blocks.forEach(block => { blockId = block.Id; blockMap[blockId] = block; if (block.BlockType === "KEY_VALUE_SET") { if (_.includes(block.EntityTypes, "KEY")) { keyMap[blockId] = block; } else { valueMap[blockId] = block; } } }); return { keyMap, valueMap, blockMap };};module.exports = async buffer => { const params = { Document: { /* required */ Bytes: buffer }, FeatureTypes: ["FORMS"] }; const request = textract.analyzeDocument(params); const data = await 
request.promise(); if (data && data.Blocks) { const { keyMap, valueMap, blockMap } = getKeyValueMap(data.Blocks); const keyValues = getKeyValueRelationship(keyMap, valueMap, blockMap); return keyValues; } // 如果没有找到块,则返回undefined return undefined;};
它在处理图片时运行正常,但处理pdf时(无论是单页还是多页)都不行。以下是我导入pdf时遇到的错误 –
(node:2001) UnhandledPromiseRejectionWarning: UnsupportedDocumentException: Request has unsupported document format at Request.extractError (/home/<user>/textract-lab/node_modules/aws-sdk/lib/protocol/json.js:51:27) at Request.callListeners (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:106:20) at Request.emit (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:78:10) at Request.emit (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:683:14) at Request.transition (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:22:10) at AcceptorStateMachine.runTo (/home/<user>/textract-lab/node_modules/aws-sdk/lib/state_machine.js:14:12) at /home/<user>/textract-lab/node_modules/aws-sdk/lib/state_machine.js:26:10 at Request.<anonymous> (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:38:9) at Request.<anonymous> (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:685:12) at Request.callListeners (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:116:18)(node:2001) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). (rejection id: 1)(node:2001) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.
我尝试过处理不含文本的图片、含文本的图片、含表格的图片、单页pdf和多页pdf。我还有一个概念上的疑问,如果我已经导入了aws-sdk,为什么我还需要为pdf编写代码,因为aws-sdk的textract应该会处理pdf、png、jpeg和jpg格式的图片?我需要对textractUtils.js做哪些修改才能处理pdf文件?
回答:
`AnalyzeDocument` API操作只支持PNG或JPEG格式的图片。根据Textract的文档:
Amazon Textract同步操作(`DetectDocumentText`和`AnalyzeDocument`)支持PNG和JPEG图像格式。异步操作(`StartDocumentTextDetection`、`StartDocumentAnalysis`)也支持PDF文件格式。
您应该使用异步操作来处理您的PDF文档。否则,一个解决方法是在您的代码中将PDF文档转换为图片,然后使用这些图片的同步API操作来处理文档。