JAVA使用Tabula解析PDF表格

2025-01-21 约 1236 字预计阅读 3 分钟

https://bing.ee123.net/img/rand?artid=118549366

JAVA使用Tabula解析PDF表格

一开始使用pdfbox解析pdf,发现解析出来的内容全都错位了,无法区分哪个内容在哪个单元格内!

后来翻阅资料可以使用Tabula来解析pdf文件内的表格，不过底层还是用的pdfbox写的。

地址：

先引入依赖：

<dependency>
	<groupId>technology.tabula</groupId>
	<artifactId>tabula</artifactId>
	<version>1.0.3</version>
		<exclusions>
			<exclusion>
				<artifactId>slf4j-simple</artifactId>
				<groupId>org.slf4j</groupId>
			</exclusion>
		</exclusions>
</dependency>

使用方式：

public class TestPdfTabula {

    public static void main(String[] args) throws ParseException {
        //-f导出格式,默认CSV  (一定要大写)
        //-p 指导出哪页,all是所有
        //path　D:\\1xx.pdf 
        //-l 强制使用点阵模式提取PDF　（关键在于这儿）
        String[] argsa = new String[]{"-f=JSON","-p=all", "D:\\1xx.pdf","-l"};
        //CommandLineApp.main(argsa);
        CommandLineParser parser = new DefaultParser();
        CommandLine cmd = parser.parse(CommandLineApp.buildOptions(), argsa);
        StringBuilder stringBuilder = new StringBuilder();
        new CommandLineApp(stringBuilder, cmd).extractTables(cmd);
        System.out.println("打印返回数据:  "+stringBuilder.toString());
    }
}

解析的JSON部分数据：

[
    {
        "extraction_method":"lattice",
        "top":150.05539,
        "left":51.102352,
        "width":493.0645751953125,
        "height":616.2757568359375,
        "right":544.16693,
        "bottom":766.3312,
        "data":[
            [
                {
                    "top":150.05539,
                    "left":51.102352,
                    "width":29.147647857666016,
                    "height":36.584625244140625,
                    "text":"序\r号"
                },
                {
                    "top":150.05539,
                    "left":80.25,
                    "width":56.600006103515625,
                    "height":36.584625244140625,
                    "text":"编号"
                },
                {
                    "top":150.05539,
                    "left":136.85,
                    "width":81.75001525878906,
                    "height":36.584625244140625,
                    "text":"坐落"
                },
                {
                    "top":150.05539,
                    "left":218.60002,
                    "width":41.04997253417969,
                    "height":36.584625244140625,
                    "text":"面积"
                },
                {
                    "top":150.05539,
                    "left":259.65,
                    "width":48,
                    "height":36.584625244140625,
                    "text":"类型"
                },
                {
                    "top":150.05539,
                    "left":307.65,
                    "width":52.95001220703125,
                    "height":36.584625244140625,
                    "text":"方式"
                },
                {
                    "top":150.05539,
                    "left":360.6,
                    "width":47.29998779296875,
                    "height":36.584625244140625,
                    "text":"时间"
                },
                {
                    "top":150.05539,
                    "left":407.9,
                    "width":42.350006103515625,
                    "height":36.584625244140625,
                    "text":"推进主\r体"
                },
                {
                    "top":150.05539,
                    "left":450.25,
                    "width":93.91693115234375,
                    "height":36.584625244140625,
                    "text":"备注"
                }
            ],
            [
                {
                    "top":243.14001,
                    "left":51.102352,
                    "width":29.147647857666016,
                    "height":56.5,
                    "text":"2"
                },
                {
                    "top":243.14001,
                    "left":80.25,
                    "width":56.600006103515625,
                    "height":56.5,
                    "text":"xx地\r块"
                },
                {
                    "top":243.14001,
                    "left":136.85,
                    "width":81.75001525878906,
                    "height":56.5,
                    "text":"xx地址"
                },
                {
                    "top":243.14001,
                    "left":218.60002,
                    "width":41.04997253417969,
                    "height":56.5,
                    "text":"38200"
                },
                {
                    "top":243.14001,
                    "left":259.65,
                    "width":48,
                    "height":56.5,
                    "text":"保障性\r租赁住\r宅用地"
                },
                {
                    "top":243.14001,
                    "left":307.65,
                    "width":52.95001220703125,
                    "height":56.5,
                    "text":"协议出让"
                },
                {
                    "top":243.14001,
                    "left":360.6,
                    "width":47.29998779296875,
                    "height":56.5,
                    "text":"2021 年\r3 月"
                },
                {
                    "top":243.14001,
                    "left":407.9,
                    "width":42.350006103515625,
                    "height":56.5,
                    "text":"某\r政府"
                },
                {
                    "top":243.14001,
                    "left":450.25,
                    "width":93.91693115234375,
                    "height":56.5,
                    "text":""
                }
            ]
        ]
    }
]

解析JSON数据：

//解析tabula读取pdf表格,将返回的数据转成jsonArray
JSONArray jsonArray = new JSONArray();
jsonArray.add(stringBuilder.toString());
List<ResidentialLand> listInfo = new ArrayList<>();
for (int i = 0; i <jsonArray.size() ; i++) {
    //获取每个页
    JSONArray jsonPage = jsonArray.getJSONArray(i);
    //遍历页
    for (int j = 0; j < jsonPage.size(); j++) {
    //获取每页中的data
    JSONArray dataArr = jsonPage.getJSONObject(j).getJSONArray("data");
    //遍历data中的每个单元格
    for (int k = 0; k < dataArr.size() ; k++) {
        //遍历data中的每一条,也就是单元格中的每一行
        JSONArray dataD = dataArr.getJSONArray(k);
        String xuhao = dataD.getJSONObject(0).get("text").toString().replaceAll("\r","");
        //如果第1个单元格的数据是序号,则跳出个这个循环,因为不能将标题存入表中
        if(xuhao.contains("序号")){
        continue;
        }
        ResidentialLand info = new ResidentialLand();
        info.setId(UUIDUtils.get36UUID());
        info.setDelFlag("0");
        //通过下标获取每个单元格的数据,下标是固定的 ,最多有9个单元格
　　	info.setNum(dataD.getJSONObject(0).get("text").toString().replaceAll("\r",""));
　　    info.setParcelCode(dataD.getJSONObject(1).get("text").toString().replaceAll("\r",""));
        info.setPosition(dataD.getJSONObject(2).get("text").toString().replaceAll("\r",""));
        info.setMeasure(dataD.getJSONObject(3).get("text").toString().replaceAll("\r",""));
        info.setType(dataD.getJSONObject(4).get("text").toString().replaceAll("\r",""));
        info.setSupplyWay(dataD.getJSONObject(5).get("text").toString().replaceAll("\r",""));
 　	info.setSupplyTime(dataD.getJSONObject(6).get("text").toString().replaceAll("\r","").replaceAll(" ",""));
        info.setPushOnMain(dataD.getJSONObject(7).get("text").toString().replaceAll("\r",""));
        info.setRemark(dataD.getJSONObject(8).get("text").toString().replaceAll("\r",""));
        listInfo.add(info);
    }
    }
}
int a = 0;
for (int i = 0; i < listInfo.size(); i++) {
    //判断序号如果是空,则将序号空的数据与上一条数据合并
    if("".equals(listInfo.get(i).getNum()) || listInfo.get(i).getNum() == null){
    a++;
    listInfo.get(i-a).setParcelCode(listInfo.get(i-a).getParcelCode()+listInfo.get(i).getParcelCode());
    listInfo.get(i-a).setPosition(listInfo.get(i-a).getPosition()+listInfo.get(i).getPosition());
    listInfo.get(i-a).setMeasure(listInfo.get(i-a).getMeasure()+listInfo.get(i).getMeasure());
    listInfo.get(i-a).setType(listInfo.get(i-a).getType()+listInfo.get(i).getType());
    listInfo.get(i-a).setSupplyWay(listInfo.get(i-a).getSupplyWay()+listInfo.get(i).getSupplyWay());
    listInfo.get(i-a).setSupplyTime(listInfo.get(i-a).getSupplyTime()+listInfo.get(i).getSupplyTime());
    listInfo.get(i-a).setPushOnMain(listInfo.get(i-a).getPushOnMain()+listInfo.get(i).getPushOnMain());
    listInfo.get(i-a).setRemark(listInfo.get(i-a).getRemark()+listInfo.get(i).getRemark());
    a--;
    }
}
for (int i = 0; i < listInfo.size(); i++) {
    //遍历删除序号是空的数据
    if("".equals(listInfo.get(i).getNum()) || listInfo.get(i).getNum() == null){
    listInfo.remove(listInfo.get(i));
    }
}

有需要的小伙伴可以参考下。

目录

JAVA使用Tabula解析PDF表格

JAVA使用Tabula解析PDF表格