Java:PDF 文件转图片并进行 OCR 识别、保存文字
【代码】Java:PDF 文件转图片并进行 OCR 识别、保存文字。
·
package org.me.swing;
import com.baidu.aip.ocr.AipOcr;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.json.JSONArray;
import org.json.JSONObject;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
// <dependency>
// <groupId>com.baidu.aip</groupId>
// <artifactId>java-sdk</artifactId>
// <version>4.8.0</version>
// </dependency>
// <dependency>
// <groupId>org.apache.pdfbox</groupId>
// <artifactId>pdfbox</artifactId>
// <version>3.0.1</version>
// </dependency>
/**
 * Renders every page of a PDF to a JPEG image, runs Baidu OCR on each image and
 * appends the recognized text to a {@code .txt} file next to the PDF.
 *
 * <p>Page images are written to an {@code images} directory beside the PDF as
 * {@code page_001.jpg}, {@code page_002.jpg}, ... The text file has the PDF's base
 * name with a {@code .txt} extension and is written as UTF-8.
 *
 * <p>Usage: {@code java org.me.swing.Pdf2ImgOcr [path-to-pdf]}. When no argument is
 * given, {@link #DEFAULT_PDF_PATH} is used.
 */
public class Pdf2ImgOcr {
    /** PDF processed when no path is passed on the command line. */
    private static final String DEFAULT_PDF_PATH = "D:\\03tmp\\zz权力操作要论(扫描).pdf";

    /** Resolution, in dots per inch, used when rasterizing each PDF page. */
    private static final float RENDER_DPI = 300f;

    /** Connect/read timeout applied to the OCR client, in milliseconds. */
    private static final int OCR_TIMEOUT_MILLIS = 60000;

    /** Optional request parameters sent with every OCR call. */
    private static final HashMap<String, String> OCR_OPTIONS = buildOcrOptions();

    /** Baidu OCR client shared by all pages. */
    private static final AipOcr client;

    // Create the OCR client once; replace the placeholders with real credentials.
    static {
        String appId = "你的 App ID";
        String apiKey = "你的 Api Key";
        String secretKey = "你的 Secret Key";
        client = new AipOcr(appId, apiKey, secretKey);
        // Optional: network timeouts.
        client.setConnectionTimeoutInMillis(OCR_TIMEOUT_MILLIS);
        client.setSocketTimeoutInMillis(OCR_TIMEOUT_MILLIS);
    }

    /**
     * Program entry point.
     *
     * @param args optional; {@code args[0]} is the path of the PDF to process.
     *             Falls back to {@link #DEFAULT_PDF_PATH} when absent.
     */
    public static void main(String[] args) {
        String filePath = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        try {
            processPdfFile(filePath);
        } catch (IOException e) {
            System.err.println("处理PDF文件时发生错误: " + e.getMessage());
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Builds the option map passed to the OCR call.
     *
     * <p>A plain {@link HashMap} is populated here instead of using double-brace
     * initialization, which would create an anonymous {@code HashMap} subclass.
     *
     * @return a new mutable map holding the OCR request options
     */
    private static HashMap<String, String> buildOcrOptions() {
        HashMap<String, String> options = new HashMap<>();
        options.put("language_type", "CHN_ENG");
        options.put("detect_direction", "true");
        options.put("detect_language", "true");
        options.put("probability", "true");
        return options;
    }

    /**
     * Converts each page of the given PDF to a JPEG, OCRs it and writes the text.
     *
     * @param filePath path of the PDF file; may be relative or absolute
     * @throws IOException if the file does not exist, the output directory or files
     *                     cannot be created, or a page image cannot be written
     */
    private static void processPdfFile(String filePath) throws IOException {
        // Resolve to an absolute path first so the parent directory is never null,
        // even when only a bare file name was supplied.
        File file = new File(filePath).getAbsoluteFile();
        if (!file.isFile()) {
            throw new IOException("文件不存在: " + filePath);
        }

        Path parentDir = file.toPath().getParent();
        Path outputTextPath = parentDir.resolve(getBaseName(file.getName()) + ".txt");
        Path imageOutputDir = parentDir.resolve("images");

        // Create the image output directory; fails loudly instead of ignoring the result.
        Files.createDirectories(imageOutputDir);

        // Files.newBufferedWriter(Path) encodes as UTF-8, independent of the platform
        // default charset, so recognized Chinese text is stored consistently.
        try (PDDocument doc = Loader.loadPDF(file);
             BufferedWriter writer = Files.newBufferedWriter(outputTextPath)) {
            PDFRenderer renderer = new PDFRenderer(doc);
            int totalPages = doc.getNumberOfPages();
            System.out.println("开始处理PDF文件: " + file.getName());
            System.out.println("总页数: " + totalPages);
            System.out.println("输出文本文件: " + outputTextPath);

            for (int i = 0; i < totalPages; i++) {
                int pageNumber = i + 1;
                System.out.printf("正在处理第 %d/%d 页...\n", pageNumber, totalPages);

                // Render the PDF page to an image.
                BufferedImage image = renderer.renderImageWithDPI(i, RENDER_DPI);
                Path imagePath = imageOutputDir.resolve(String.format("page_%03d.jpg", pageNumber));

                // The File overload lets ImageIO open and close the underlying stream
                // itself, so no file handle is leaked per page. A false return means
                // no suitable writer was found for the format.
                if (!ImageIO.write(image, "jpg", imagePath.toFile())) {
                    throw new IOException("无法写入JPEG图片: " + imagePath);
                }

                // Run OCR on the rendered page.
                String text = imgOcr(imagePath.toAbsolutePath().toString());

                // Append the recognized text under a per-page header; flush after every
                // page so partial results survive a later failure.
                writer.write(String.format("\n=== 第 %d 页 ===\n", pageNumber));
                writer.write(text);
                writer.flush();
            }
            System.out.println("PDF处理完成!");
        }
    }

    /**
     * Runs OCR on one image file and returns the recognized lines.
     *
     * <p>This is best-effort: on any failure the problem is reported on standard
     * error and an empty string is returned so the remaining pages are still processed.
     *
     * @param imgPath path of the image file to recognize
     * @return the recognized text, one result entry per line (each followed by
     *         {@code "\n"}), or an empty string if recognition failed
     */
    private static String imgOcr(String imgPath) {
        try {
            // Call the OCR endpoint with the optional parameters.
            JSONObject res = client.basicGeneral(imgPath, OCR_OPTIONS);
            if (!res.has("words_result")) {
                System.err.println("OCR识别失败: " + res.toString());
                return "";
            }
            JSONArray wordsResult = res.getJSONArray("words_result");
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < wordsResult.length(); i++) {
                JSONObject jo = wordsResult.getJSONObject(i);
                sb.append(jo.getString("words")).append("\n");
            }
            return sb.toString();
        } catch (Exception e) {
            // Deliberately broad: a single bad page must not abort the whole document.
            System.err.println("OCR处理图片时发生错误: " + imgPath);
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Strips the last extension from a file name.
     *
     * @param fileName a file name without directory components
     * @return the part before the last {@code '.'}, or the whole name if it has none
     */
    private static String getBaseName(String fileName) {
        int dotIndex = fileName.lastIndexOf('.');
        return (dotIndex == -1) ? fileName : fileName.substring(0, dotIndex);
    }
}
更多推荐
所有评论(0)