第一步:导入jar包(在pom.xml中添加以下Maven依赖)
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.12</version>
</dependency>
<dependency>
<groupId>com.baidu.aip</groupId>
<artifactId>java-sdk</artifactId>
<version>4.8.0</version>
</dependency>
第二步:提取pdf中的图片
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.util.Iterator;
import javax.imageio.ImageIO;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
/**
 * Extracts the embedded images of a PDF document and saves each one as a
 * sequentially numbered JPEG file ({@code 1.jpg}, {@code 2.jpg}, ...).
 *
 * <p>The input PDF and the output directory are the hard-coded paths in
 * {@link #main(String[])}.
 */
public class Test2 {

    /**
     * Name of an image XObject that is deliberately not extracted.
     * NOTE(review): presumably a watermark/stamp added by the PDF producer - confirm.
     */
    private static final String SKIPPED_IMAGE_NAME = "QuickPDFIm848de7a9";

    /**
     * Walks every page of the PDF, writes each image XObject found in the page
     * resources to the output directory and prints the number of each file saved.
     *
     * <p>A failure on a single image is reported on standard error and the
     * extraction continues with the next image.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String file = "/Users/jin/Downloads/xxxxx.pdf";
        String path = "/Users/jin/Downloads/img/";

        // ImageIO cannot write into a directory that does not exist.
        File outputDir = new File(path);
        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
            System.err.println("Cannot create output directory " + path);
            return;
        }

        // try-with-resources closes both the file stream and the document,
        // even when extraction fails part-way through.
        try (FileInputStream fis = new FileInputStream(file);
             PDDocument document = PDDocument.load(fis)) {
            int pages = document.getNumberOfPages();
            int count = 1;
            // Page indices are 0-based; starting at 0 includes the first page.
            for (int j = 0; j < pages; j++) {
                PDPage page = document.getPage(j);
                PDResources resources = page.getResources();
                if (resources == null) {
                    continue;
                }
                Iterable<COSName> xobjects = resources.getXObjectNames();
                if (xobjects == null) {
                    continue;
                }
                for (COSName key : xobjects) {
                    if (!resources.isImageXObject(key) || SKIPPED_IMAGE_NAME.equals(key.getName())) {
                        continue;
                    }
                    File target = new File(path + count + ".jpg");
                    try {
                        PDImageXObject image = (PDImageXObject) resources.getXObject(key);
                        BufferedImage bimage = image.getImage();
                        // ImageIO.write returns false when no JPEG writer accepts the image;
                        // only advance the counter when a file was really written so the
                        // output numbering has no gaps.
                        if (ImageIO.write(bimage, "jpg", target)) {
                            System.out.println(count);
                            count++;
                        } else {
                            System.err.println("No JPEG writer could encode image " + key.getName()
                                    + " on page index " + j);
                        }
                    } catch (Exception e) {
                        // Best effort: report the broken image and carry on with the rest.
                        System.err.println("Failed to save image " + key.getName()
                                + " on page index " + j + " to " + target + ": " + e);
                    }
                }
            }
        } catch (Exception e) {
            System.err.println("Failed to extract images from " + file + ": " + e);
        }
    }
}
第三步:把图片转换成文本保存(这里使用的是百度文字识别)
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashMap;
import org.json.JSONArray;
import org.json.JSONObject;
import com.baidu.aip.ocr.AipOcr;
import com.jin.demo.util.FileUtil;
/**
 * Sends a numbered series of JPEG images to the Baidu OCR service and appends
 * the recognised text, one result entry per line, to a text file.
 *
 * <p>The image directory, the image number range and the output file are
 * hard-coded in {@link #main(String[])}.
 */
public class Sample {
    // APPID / API key / secret key used to construct the OCR client
    public static final String APP_ID = "xxx";
    public static final String API_KEY = "xxx";
    public static final String SECRET_KEY = "xxx";

    /**
     * Runs general text recognition on images {@code 4.jpg} to {@code 221.jpg}
     * and appends every recognised line to the output file.
     *
     * <p>An image whose response carries no {@code words_result} array is
     * reported on standard error and skipped, so one bad image does not abort
     * the whole batch.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the AipOcr client
        AipOcr client = new AipOcr(APP_ID, API_KEY, SECRET_KEY);
        // Optional: network connection parameters
        client.setConnectionTimeoutInMillis(2000);
        client.setSocketTimeoutInMillis(60000);
        // This property can also be supplied directly as a JVM startup argument
        System.setProperty("aip.log4j.conf", "path/to/your/log4j.properties");
        String filePath = "/Users/jin/Desktop/book/xxx.txt";
        System.out.println("---begin---");
        // Call the OCR interface once per image
        for (int i = 4; i < 222; i++) {
            System.out.println("---" + i + "---");
            String path = "/Users/jin/Downloads/img/" + i + ".jpg";
            JSONObject res = client.basicGeneral(path, new HashMap<String, String>());
            // optJSONArray returns null instead of throwing when the key is absent,
            // e.g. when the response is an error object.
            JSONArray jsonArray = res.optJSONArray("words_result");
            if (jsonArray == null) {
                System.err.println("No words_result for " + path + ": " + res);
                continue;
            }
            for (int j = 0; j < jsonArray.length(); j++) {
                JSONObject jsonObject = jsonArray.getJSONObject(j);
                String content = jsonObject.getString("words");
                addContent(filePath, content);
            }
        }
        System.out.println("---over---");
    }

    /**
     * Appends {@code content} followed by a line separator to the file at
     * {@code path}. The file is created when it does not exist yet; otherwise
     * the text is added after the existing content. Text is encoded with the
     * JVM's default charset.
     *
     * <p>I/O failures are reported on standard error and are not propagated to
     * the caller.
     *
     * @param path    location of the file to append to
     * @param content text to write as one line
     */
    public static void addContent(String path, String content) {
        File f = new File(path);
        // try-with-resources closes the writers on every path; if the file cannot
        // be opened, no writer is ever used.
        try (FileWriter fw = new FileWriter(f, true);
             PrintWriter pw = new PrintWriter(fw)) {
            pw.println(content);
            // PrintWriter swallows IOExceptions; checkError() flushes and exposes them.
            if (pw.checkError()) {
                System.err.println("Failed to write to " + path);
            }
        } catch (IOException e) {
            System.err.println("Failed to append to " + path);
            e.printStackTrace();
        }
    }
}
网友评论