当前位置:网站首页>OCR识别PDF文件
OCR识别PDF文件
2022-04-23 02:39:00 【一只努力xx的程序媛】
1 现有解析pdf的方法
使用 org.apache.pdfbox 读取 pdf 时,只能读取 pdf 中的文字。纸件扫描生成的 pdf,有些文字会错乱,有些文字仍以图片的方式显示,导致读取的内容不全,常常获取不到想要的数据。
2 OCR文字识别
pdf 需要先转换为图片,再进行识别,识别率高。
2.1 调用百度接口
优点:识别率高,识别速度快
缺点:按次收费
2.2 使用开源工具读取pdf文档
2.2.1 下载工具包
https://github.com/tesseract-ocr/tessdata 下载chi_sim.traineddata,chi_sim_vert.traineddata
2.2.2 添加依赖
<dependencies>
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>4.4.0</version>
</dependency>
</dependencies>
2.2.3 编写程序
import java.io.File;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
/**
 * Minimal Tess4J demo: runs OCR over one image file and prints the recognised
 * text together with the time the recognition took.
 */
public class TestTess {
    /**
     * Recognises the text of a hard-coded image using the simplified-Chinese
     * language data and prints it to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Path of the image to recognise (change to your own image path).
        String path = "C:\\work\\notebook\\prototype\\target\\qq1.jpg";
        // Location of the language data (change to your own tessdata folder).
        // String languagePath = "C:\\work\\projects\\tess\\resources\\tessdata";
        File file = new File(path);
        ITesseract instance = new Tesseract();
        // Set the location of the trained data.
        // instance.setDatapath(languagePath);
        // chi_sim: simplified Chinese, eng: English; pick the language data you need.
        instance.setLanguage("chi_sim");

        String result;
        try {
            long startTime = System.currentTimeMillis();
            result = instance.doOCR(file);
            long endTime = System.currentTimeMillis();
            System.out.println("Time is:" + (endTime - startTime) + " 毫秒");
        } catch (TesseractException e) {
            // Recognition failed: report it and stop, instead of falling through
            // and printing "null" as if it were the recognised text.
            System.err.println("OCR failed for " + path + ": " + e.getMessage());
            e.printStackTrace();
            return;
        }
        System.out.println("result: ");
        System.out.println(result);
    }
}
2.3 读取pdf的特定位置的数据
2.3.1 手动截取pdf指定矩形区域的材料
目前是手动截取;后续需要前端配合做页面,改成自动获取,这样增加文档类型时,只需要配置一下即可。
2.3.2 Python程序获取图片标识区域在整个文档中的矩形坐标
import aircv
def matchImg(imgsrc, imgobj, confidence=0.2):
    """Find where the template image ``imgobj`` sits inside the image ``imgsrc``.

    Intended for batch-extracting the same required parts from uniform images.

    :param imgsrc: path of the original image (str)
    :param imgobj: path of the template image to look for (str)
    :param confidence: match threshold (0 < confidence < 1.0)
    :return: None, or the dict produced by ``aircv.find_template`` with keys
        'confidence' (similarity, float), 'rectangle' (rectangle coordinates in
        the original image, tuple) and 'result' (centre coordinates, tuple),
        extended with 'shape' = (width, height) of the original image
    """
    source = aircv.imread(imgsrc)
    needle = aircv.imread(imgobj)
    # Sample return value of find_template:
    # {'confidence': 0.5435812473297119,
    #  'rectangle': ((394, 384), (394, 416), (450, 384), (450, 416)),
    #  'result': (422.0, 400.0)}
    found = aircv.find_template(source, needle, confidence)
    if found is None:
        return None
    # shape index 0 is the height, index 1 is the width
    found['shape'] = (source.shape[1], source.shape[0])
    return found
# Maps each field name to the template image used to find that field in the page image.
template = {
    'address': 'dz.jpg', 'doc_num': 'fw.jpg', 'doc_type': 'fwlx.jpg', 'issue_date': 'fwrq.jpg',
    'int_cls': 'splb.jpg', 'apply_num': 'sqh.jpg', 'applyer': 'sqr.jpg', 'content': 'zw.jpg'}

# Print, for every field, its rectangle inside target/qq.jpg as [x, y, w, h].
for key, value in template.items():
    orig = matchImg("target/qq.jpg", "target/" + value)
    if orig is None:
        # matchImg returns None when the template is not found; report the miss
        # instead of failing with a TypeError on orig['rectangle'].
        print(key, None)
        continue
    rect = orig['rectangle']
    # Presumably rect[0] is the top-left and rect[3] the bottom-right corner,
    # judging from the sample ((394, 384), (394, 416), (450, 384), (450, 416)).
    x = rect[0][0]
    y = rect[0][1]
    w = rect[3][0] - x
    h = rect[3][1] - y
    ret = [x, y, w, h]
    print(key, ret)
2.3.3 Java程序根据矩形区域坐标,获取指定位置信息
package odysssey.tess;
import java.io.File;
import java.io.IOException;
import java.util.List;
import javax.imageio.ImageIO;
import java.awt.Rectangle;
import net.sourceforge.tess4j.ITessAPI.TessPageIteratorLevel;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
/**
 * Region-based Tess4J demo: runs OCR on a fixed list of rectangles of one image
 * and prints the text recognised in each rectangle, followed by the total time.
 */
public class TestTess {
    /**
     * Recognises every configured region of a hard-coded image with the
     * simplified-Chinese language data and prints one result per region.
     *
     * @param args unused
     * @throws IOException declared for the optional ImageIO-based segmentation
     *     experiment mentioned in the body; the active code does not throw it
     */
    public static void main(String[] args) throws IOException {
        // Path of the image to recognise (change to your own image path).
        String imagePath = "C:\\work\\notebook\\prototype\\target\\qq.jpg";
        // Location of the language data (change to your own tessdata folder).
        // String languagePath = "C:\\work\\projects\\tess\\resources\\tessdata";
        File image = new File(imagePath);
        ITesseract ocr = new Tesseract();

        // Regions to recognise, each given as {x, y, width, height}.
        int[][] regions = {
            {61, 312, 734, 82},    // address
            {1002, 338, 527, 78},  // doc_num
            {425, 736, 801, 82},   // doc_type
            {64, 593, 495, 64},    // issue_date
            {115, 969, 346, 82},   // int_cls
            {676, 589, 388, 68},   // apply_num
            {72, 450, 481, 68},    // applyer
            {107, 899, 1439, 70},  // content
        };

        // chi_sim: simplified Chinese, eng: English; pick the language data you need.
        ocr.setLanguage("chi_sim");
        ocr.setTessVariable("user_defined_dpi", "96");

        try {
            long begin = System.currentTimeMillis();
            // Whole-image alternative: ocr.doOCR(image)
            for (int[] region : regions) {
                Rectangle area = new Rectangle(region[0], region[1], region[2], region[3]);
                String text = ocr.doOCR(image, area);
                System.out.println(text);
            }
            // Alternative experiment (disabled): list the symbol boxes with
            // ocr.getSegmentedRegions(ImageIO.read(image), TessPageIteratorLevel.RIL_SYMBOL)
            // and print each as "Box[i]: x=.., y=.., w=.., h=..".
            long finish = System.currentTimeMillis();
            System.out.println("Time is:" + (finish - begin) + " 毫秒");
        } catch (TesseractException e) {
            e.printStackTrace();
        }
    }
}
版权声明
本文为[一只努力xx的程序媛]所创,转载请带上原文链接,感谢
https://blog.csdn.net/qq_23888451/article/details/104803782
边栏推荐
- New book recommendation - IPv6 technology and application (Ruijie version)
- Talk about current limiting
- Go语言web中间件的使用
- Suggestion: block reference sorting is in the order of keywords
- MySQL JDBC programming
- Rich intelligent auxiliary functions and exposure of Sihao X6 security configuration: it will be pre sold on April 23
- php+mysql對下拉框搜索的內容修改
- Target narak
- Global, exclusive and local routing guard
- Day 3 of learning rhcsa
猜你喜欢
随机推荐
电源电路设计原来是这么回事
[wechat applet] set the bottom menu (tabbar) for the applet
IAR嵌入式開發STM32f103c8t6之點亮LED燈
小程序 读取文件
Flink stream processing engine system learning (I)
双亲委派模型【理解】
TypeScript(1)
Rich intelligent auxiliary functions and exposure of Sihao X6 security configuration: it will be pre sold on April 23
[XJTU computer network security and management] Lecture 2 password technology
Using go language to build web server
How to solve the complexity of project document management?
Renesas electronic MCU RT thread development and Design Competition
tp6阿裏雲短信 window 報 cURL error 60: SSL certificate problem: unable to get local issuer certificate
PIP install shutil reports an error
[suggestion collection] hematemesis sorting out golang interview dry goods 21 questions - hanging interviewer-1
How to prevent leakage of operation and maintenance data
每日一题冲刺大厂第十六天 NOIP普及组 三国游戏
程序设计天梯赛 L1-49 天梯赛分配座位(模拟),布响丸辣
A domestic image segmentation project is heavy and open source!
【2019-CVPR-3D人体姿态估计】Fast and Robust Multi-Person 3D Pose Estimation from Multiple Views