poi怎么样读取word文档的目录结构

码拜

9年 ago

如下图:
本人使用POI读取一个文档, HWPFDocument 类只能按段落读取,没办法识别文档的结构,求大神帮忙,或告知POI有没有这样的功能.

上面的描述可能有点太粗糙了, 在这里本人重新啰嗦一下:
本人需要读取一个 word 文档, word 文档中包含很多种“标题”(1-9 号标题, word 文档中的目录也是按标题来生成的, 这么说您应该懂了本人所说的“标题”是什么东西了吧!), 好吧, 再附图

下面看本人写的实现:
// Load the binary .doc from the already-open stream `is` and wrap it in a text extractor.
HWPFDocument hwpfd = new HWPFDocument(is);
WordExtractor wordExtractor = new WordExtractor(hwpfd);
// Print the text of every paragraph to the console, in the order the extractor returns them.
for (String paragraphText : wordExtractor.getParagraphText()) {
    System.out.println(paragraphText);
}
这样的话控制台输出的是文档中的全部内容, 是按段落(Paragraph)区分开来的, 但这样无法满足本人的需求, 本人希望程序知道整个文档的目录结构, 也就是知道哪个段落的父亲是谁, 兄弟是谁, 有哪些孩子等关系.
好了, 大神,您懂了么?

解决方案

使用poi获取word 标题即可。嵌套结构就是标题的大小级别。
POI获取WORD标题
开源中国
开源中国
发表于 2014-08-23 00:10:28

public class WordUtil {
//2003
public static List<String> getWordTitles2003(String path) throws IOException{
File file = new File(path);
String filename = file.getName();
filename = filename.substring(0, filename.lastIndexOf(“.”));
InputStream is = new FileInputStream(path);
HWPFDocument doc = new HWPFDocument(is);
Range r = doc.getRange();
List<String> list = new ArrayList<String>();
for (int i = 0; i < r.numParagraphs(); i++) {
Paragraph p = r.getParagraph(i);
// check if style index is greater than total number of styles
int numStyles =doc.getStyleSheet().numStyles();
int styleIndex = p.getStyleIndex();
if (numStyles > styleIndex) {
StyleSheet style_sheet = doc.getStyleSheet();
StyleDescription style = style_sheet.getStyleDescription(styleIndex);
String styleName = style.getName();
if (styleName!=null&&styleName.contains(“标题”)) {
// write style name and associated text
// System.out.println(styleName +”->”+ p.text());
//
System.out.println(p.text());
String text = p.text();
list.add(text);
}
}
}
//TODO 图表跟图片不一样，需另外处理
//得到word数据流
byte [] dataStream = doc.getDataStream();
//用于在一段范围内获得段落数
int numCharacterRuns = r.numCharacterRuns();
// System.out.println(“CharacterRuns 数:”+numCharacterRuns);
//负责图像提取和确定一些文件某块能否包含嵌入的图像。
PicturesTable table = new PicturesTable(doc, dataStream, null, null, null);
//文章图片编号
int i = 1;
for(int j=0 ; j<numCharacterRuns ; j++){
//这个类表示一个文本运行，有着共同的属性。
CharacterRun run = r.getCharacterRun(j);
//能否存在图片
boolean bool = table.hasPicture(run);
if(bool){
//返回图片对象绑定到指定的CharacterRun
Picture pic = table.extractPicture(run, true);
//图片的内容字节写入到指定的输出流。
pic.writeImageContent(new FileOutputStream(“E:\temp”+filename+”_”+i+”.jpg”));
i++;
}
}
return list;
}
public static List<String> getWordTitles2007(String path) throws IOException{
InputStream is = new FileInputStream(path);
//2007
//
OPCPackage p = POIXMLDocument.openPackage(path);
//
XWPFWordExtractor e = new XWPFWordExtractor(p);
//
POIXMLDocument doc = e.getDocument();
List<String> list = new ArrayList<String>();
XWPFDocument doc = new XWPFDocument(is);
XWPFParagraph[]paras = doc.getParagraphs();
for (XWPFParagraph graph : paras) {
String text = graph.getParagraphText();
String style = graph.getStyle();
if (“1″.equals(style)) {
//
System.out.println(text+”–[“+style+”]”);
}else if (“2″.equals(style)) {
//
System.out.println(text+”–[“+style+”]”);
}else if (“3″.equals(style)) {
//
System.out.println(text+”–[“+style+”]”);
}else{
continue;
}
list.add(text);
}
return list;
}
public static void main(String[] args) throws IOException {
String path =”E:/temp/poi_test.doc”;
List<String> list = new ArrayList<String>();
if (path.endsWith(“.doc”)) {
list = getWordTitles2003(path);
}else if (path.endsWith(“.docx”)) {
list = getWordTitles2007(path);
}
for (String title : list) {
System.out.println(title);
}
}
}