JAVA使用Tabula解析PDF表格
目录
JAVA使用Tabula解析PDF表格
一开始使用pdfbox解析pdf,发现解析出来的内容全都错位了,无法区分哪个内容在哪个单元格内!
后来翻阅资料可以使用Tabula来解析pdf文件内的表格,不过底层还是用的pdfbox写的。
地址:
先引入依赖:
<dependency>
<groupId>technology.tabula</groupId>
<artifactId>tabula</artifactId>
<version>1.0.3</version>
<exclusions>
<exclusion>
<artifactId>slf4j-simple</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
使用方式:
public class TestPdfTabula {
public static void main(String[] args) throws ParseException {
//-f导出格式,默认CSV (一定要大写)
//-p 指导出哪页,all是所有
//path D:\\1xx.pdf
//-l 强制使用点阵模式提取PDF (关键在于这儿)
String[] argsa = new String[]{"-f=JSON","-p=all", "D:\\1xx.pdf","-l"};
//CommandLineApp.main(argsa);
CommandLineParser parser = new DefaultParser();
CommandLine cmd = parser.parse(CommandLineApp.buildOptions(), argsa);
StringBuilder stringBuilder = new StringBuilder();
new CommandLineApp(stringBuilder, cmd).extractTables(cmd);
System.out.println("打印返回数据: "+stringBuilder.toString());
}
}
解析的JSON部分数据:
[
{
"extraction_method":"lattice",
"top":150.05539,
"left":51.102352,
"width":493.0645751953125,
"height":616.2757568359375,
"right":544.16693,
"bottom":766.3312,
"data":[
[
{
"top":150.05539,
"left":51.102352,
"width":29.147647857666016,
"height":36.584625244140625,
"text":"序\r号"
},
{
"top":150.05539,
"left":80.25,
"width":56.600006103515625,
"height":36.584625244140625,
"text":"编号"
},
{
"top":150.05539,
"left":136.85,
"width":81.75001525878906,
"height":36.584625244140625,
"text":"坐落"
},
{
"top":150.05539,
"left":218.60002,
"width":41.04997253417969,
"height":36.584625244140625,
"text":"面积"
},
{
"top":150.05539,
"left":259.65,
"width":48,
"height":36.584625244140625,
"text":"类型"
},
{
"top":150.05539,
"left":307.65,
"width":52.95001220703125,
"height":36.584625244140625,
"text":"方式"
},
{
"top":150.05539,
"left":360.6,
"width":47.29998779296875,
"height":36.584625244140625,
"text":"时间"
},
{
"top":150.05539,
"left":407.9,
"width":42.350006103515625,
"height":36.584625244140625,
"text":"推进主\r体"
},
{
"top":150.05539,
"left":450.25,
"width":93.91693115234375,
"height":36.584625244140625,
"text":"备注"
}
],
[
{
"top":243.14001,
"left":51.102352,
"width":29.147647857666016,
"height":56.5,
"text":"2"
},
{
"top":243.14001,
"left":80.25,
"width":56.600006103515625,
"height":56.5,
"text":"xx地\r块"
},
{
"top":243.14001,
"left":136.85,
"width":81.75001525878906,
"height":56.5,
"text":"xx地址"
},
{
"top":243.14001,
"left":218.60002,
"width":41.04997253417969,
"height":56.5,
"text":"38200"
},
{
"top":243.14001,
"left":259.65,
"width":48,
"height":56.5,
"text":"保障性\r租赁住\r宅用地"
},
{
"top":243.14001,
"left":307.65,
"width":52.95001220703125,
"height":56.5,
"text":"协议出让"
},
{
"top":243.14001,
"left":360.6,
"width":47.29998779296875,
"height":56.5,
"text":"2021 年\r3 月"
},
{
"top":243.14001,
"left":407.9,
"width":42.350006103515625,
"height":56.5,
"text":"某\r政府"
},
{
"top":243.14001,
"left":450.25,
"width":93.91693115234375,
"height":56.5,
"text":""
}
]
]
}
]
解析JSON数据:
//解析tabula读取pdf表格,将返回的数据转成jsonArray
JSONArray jsonArray = new JSONArray();
jsonArray.add(stringBuilder.toString());
List<ResidentialLand> listInfo = new ArrayList<>();
for (int i = 0; i <jsonArray.size() ; i++) {
//获取每个页
JSONArray jsonPage = jsonArray.getJSONArray(i);
//遍历页
for (int j = 0; j < jsonPage.size(); j++) {
//获取每页中的data
JSONArray dataArr = jsonPage.getJSONObject(j).getJSONArray("data");
//遍历data中的每个单元格
for (int k = 0; k < dataArr.size() ; k++) {
//遍历data中的每一条,也就是单元格中的每一行
JSONArray dataD = dataArr.getJSONArray(k);
String xuhao = dataD.getJSONObject(0).get("text").toString().replaceAll("\r","");
//如果第1个单元格的数据是序号,则跳出个这个循环,因为不能将标题存入表中
if(xuhao.contains("序号")){
continue;
}
ResidentialLand info = new ResidentialLand();
info.setId(UUIDUtils.get36UUID());
info.setDelFlag("0");
//通过下标获取每个单元格的数据,下标是固定的 ,最多有9个单元格
info.setNum(dataD.getJSONObject(0).get("text").toString().replaceAll("\r",""));
info.setParcelCode(dataD.getJSONObject(1).get("text").toString().replaceAll("\r",""));
info.setPosition(dataD.getJSONObject(2).get("text").toString().replaceAll("\r",""));
info.setMeasure(dataD.getJSONObject(3).get("text").toString().replaceAll("\r",""));
info.setType(dataD.getJSONObject(4).get("text").toString().replaceAll("\r",""));
info.setSupplyWay(dataD.getJSONObject(5).get("text").toString().replaceAll("\r",""));
info.setSupplyTime(dataD.getJSONObject(6).get("text").toString().replaceAll("\r","").replaceAll(" ",""));
info.setPushOnMain(dataD.getJSONObject(7).get("text").toString().replaceAll("\r",""));
info.setRemark(dataD.getJSONObject(8).get("text").toString().replaceAll("\r",""));
listInfo.add(info);
}
}
}
int a = 0;
for (int i = 0; i < listInfo.size(); i++) {
//判断序号如果是空,则将序号空的数据与上一条数据合并
if("".equals(listInfo.get(i).getNum()) || listInfo.get(i).getNum() == null){
a++;
listInfo.get(i-a).setParcelCode(listInfo.get(i-a).getParcelCode()+listInfo.get(i).getParcelCode());
listInfo.get(i-a).setPosition(listInfo.get(i-a).getPosition()+listInfo.get(i).getPosition());
listInfo.get(i-a).setMeasure(listInfo.get(i-a).getMeasure()+listInfo.get(i).getMeasure());
listInfo.get(i-a).setType(listInfo.get(i-a).getType()+listInfo.get(i).getType());
listInfo.get(i-a).setSupplyWay(listInfo.get(i-a).getSupplyWay()+listInfo.get(i).getSupplyWay());
listInfo.get(i-a).setSupplyTime(listInfo.get(i-a).getSupplyTime()+listInfo.get(i).getSupplyTime());
listInfo.get(i-a).setPushOnMain(listInfo.get(i-a).getPushOnMain()+listInfo.get(i).getPushOnMain());
listInfo.get(i-a).setRemark(listInfo.get(i-a).getRemark()+listInfo.get(i).getRemark());
a--;
}
}
for (int i = 0; i < listInfo.size(); i++) {
//遍历删除序号是空的数据
if("".equals(listInfo.get(i).getNum()) || listInfo.get(i).getNum() == null){
listInfo.remove(listInfo.get(i));
}
}
有需要的小伙伴可以参考下。