Java POI组件——简单提取Word、word转html、text、xml(仅支持doc,不支持docx)

举报
福州司马懿 发表于 2021/11/19 03:47:08 2021/11/19
【摘要】 需要添加的库:poi-3.15.jar、poi-ooxml-3.15.jar、poi-scratchpad-3.15.jar。 package com.poi.word; import java.io.F...

需要添加的库

  • poi-3.15.jar
  • poi-ooxml-3.15.jar
  • poi-scratchpad-3.15.jar
package com.poi.word;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedList;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.POITextExtractor;
import org.apache.poi.hpsf.ClassID;
import org.apache.poi.hpsf.CustomProperties;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.Property;
import org.apache.poi.hpsf.Section;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hpsf.Thumbnail;
import org.apache.poi.hpsf.wellknown.PropertyIDMap;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.AbstractWordConverter;
import org.apache.poi.hwpf.converter.WordToFoConverter;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.converter.WordToTextConverter;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.w3c.dom.Document;

/**
 * Demonstrates Apache POI's HWPF support for Word documents: extracting text
 * and metadata, and converting a document to HTML, plain text and XML.
 *
 * <p>POI's Word support is limited: the HTML/text/XML conversions used here
 * only work for the legacy {@code .doc} format, not {@code .docx}.
 */
public class PoiWordClass {
    /**
     * Encoding declared to the transformer and used to encode the output file.
     * Both must agree, otherwise the declared encoding in the generated
     * document does not match the bytes actually written.
     */
    private static final String OUTPUT_ENCODING = "gbk";

    /**
     * Reads the Word document at {@code path} and prints every text view the
     * {@link WordExtractor} offers, followed by the document's metadata.
     * I/O failures are reported on standard error and otherwise ignored.
     *
     * @param path location of the {@code .doc} file to read
     */
    private static void extract(String path) {
        // try-with-resources closes the stream on every exit path.
        try (InputStream is = new FileInputStream(path)) {
            WordExtractor extractor = new WordExtractor(is);

            printHeading("extractor.getText()");
            System.out.println(extractor.getText());

            printHeading("extractor.getTextFromPieces()");
            System.out.println(extractor.getTextFromPieces());

            printHeading("extractor.getHeaderText()");
            System.out.println(extractor.getHeaderText());

            printHeading("extractor.getFooterText()");
            System.out.println(extractor.getFooterText());

            printHeading("extractor.getCommentsText()");
            printLines(extractor.getCommentsText());

            printHeading("extractor.getEndnoteText()");
            printLines(extractor.getEndnoteText());

            printHeading("extractor.getFootnoteText()");
            printLines(extractor.getFootnoteText());

            printHeading("extractor.getMainTextboxText()");
            printLines(extractor.getMainTextboxText());

            printHeading("extractor.getParagraphText()");
            printLines(extractor.getParagraphText());

            // NOTE(review): the summary objects are presumably null when the
            // document carries no such property stream -- confirm against the
            // POI Javadoc. println(Object) prints "null" instead of throwing.
            printHeading("extractor.getDocSummaryInformation().toString()");
            DocumentSummaryInformation docSummaryInformation = extractor.getDocSummaryInformation();
            System.out.println(docSummaryInformation);

            // The heading names getText() because that is what is printed.
            printHeading("extractor.getMetadataTextExtractor().getText()");
            POITextExtractor metadataTextExtractor = extractor.getMetadataTextExtractor();
            System.out.println(metadataTextExtractor.getText());

            printHeading("extractor.getSummaryInformation().toString()");
            SummaryInformation summaryInformation = extractor.getSummaryInformation();
            System.out.println(summaryInformation);

            if (docSummaryInformation != null) {
                print(docSummaryInformation);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Prints {@code label} preceded by a blank line. */
    private static void printHeading(String label) {
        System.out.println("\n" + label);
    }

    /** Prints each element of {@code lines} on its own line. */
    private static void printLines(String[] lines) {
        for (String line : lines) {
            System.out.println(line);
        }
    }

    /** Prints a single {@code name = value} line. */
    private static void printValue(String name, Object value) {
        System.out.println(name + " = " + value);
    }

    /** Summarises a byte array as its length, because its toString() is not informative. */
    private static String describe(byte[] bytes) {
        return bytes == null ? "null" : bytes.length + " bytes";
    }

    /** Summarises an array as its element count. */
    private static String describe(Object[] items) {
        return items == null ? "null" : items.length + " entries";
    }

    /**
     * Prints the individual properties exposed by a document summary.
     *
     * @param docSummaryInformation the summary to dump; must not be null
     */
    private static void print(DocumentSummaryInformation docSummaryInformation) {
        printValue("applicationVersion", docSummaryInformation.getApplicationVersion());
        printValue("byteCount", docSummaryInformation.getByteCount());
        printValue("byteOrder", docSummaryInformation.getByteOrder());
        printValue("category", docSummaryInformation.getCategory());
        printValue("charCountWithSpaces", docSummaryInformation.getCharCountWithSpaces());
        printValue("class", docSummaryInformation.getClass().getName());
        printValue("company", docSummaryInformation.getCompany());
        printValue("contentStatus", docSummaryInformation.getContentStatus());
        printValue("contentType", docSummaryInformation.getContentType());
        printValue("customProperties", docSummaryInformation.getCustomProperties());
        // getDocparts() is skipped: the original author observed
        // java.lang.UnsupportedOperationException: Reading byte arrays is not yet implemented.
        printValue("documentVersion", docSummaryInformation.getDocumentVersion());
        printValue("firstSection", docSummaryInformation.getFirstSection());
        printValue("format", docSummaryInformation.getFormat());
        // getHeadingPair() is skipped for the same UnsupportedOperationException.
        printValue("hiddenCount", docSummaryInformation.getHiddenCount());
        printValue("hyperlinksChanged", docSummaryInformation.getHyperlinksChanged());
        printValue("language", docSummaryInformation.getLanguage());
        printValue("lineCount", docSummaryInformation.getLineCount());
        printValue("linksDirty", docSummaryInformation.getLinksDirty());
        printValue("manager", docSummaryInformation.getManager());
        printValue("mmClipCount", docSummaryInformation.getMMClipCount());
        printValue("noteCount", docSummaryInformation.getNoteCount());
        printValue("osVersion", docSummaryInformation.getOSVersion());
        printValue("parCount", docSummaryInformation.getParCount());
        printValue("presentationFormat", docSummaryInformation.getPresentationFormat());
        printValue("properties", describe(docSummaryInformation.getProperties()));
        printValue("propertySetIDMap", docSummaryInformation.getPropertySetIDMap());
        printValue("scale", docSummaryInformation.getScale());
        printValue("sectionCount", docSummaryInformation.getSectionCount());
        // Deliberately not cast to a concrete list class: which implementation
        // POI returns is not part of its contract.
        printValue("sections", docSummaryInformation.getSections());
        // getSingleSection() is skipped: the original author observed
        // org.apache.poi.hpsf.NoSingleSectionException: Property set contains 2 sections.
        printValue("slideCount", docSummaryInformation.getSlideCount());
        printValue("vbaDigitalSignature", describe(docSummaryInformation.getVBADigitalSignature()));

        // Original author's note: dumping the object through reflection is not
        // useful because the values live in an internal map rather than in fields.
    }

    /**
     * Prints what a metadata text extractor exposes.
     *
     * <p>NOTE(review): nothing in this class calls this overload. If the
     * argument is itself a metadata extractor, POI may refuse the nested
     * getMetadataTextExtractor() call -- confirm before wiring it in.
     *
     * @param metadataTextExtractor the extractor to dump; must not be null
     */
    private static void print(POITextExtractor metadataTextExtractor) {
        printValue("class", metadataTextExtractor.getClass().getName());
        printValue("metadataTextExtractor", metadataTextExtractor.getMetadataTextExtractor());
        printValue("text", metadataTextExtractor.getText());
    }

    /**
     * Prints the individual properties exposed by a summary information set.
     * Nothing in this class calls this overload.
     *
     * @param summaryInformation the summary to dump; must not be null
     */
    private static void print(SummaryInformation summaryInformation) {
        printValue("applicationName", summaryInformation.getApplicationName());
        printValue("author", summaryInformation.getAuthor());
        printValue("byteOrder", summaryInformation.getByteOrder());
        printValue("charCount", summaryInformation.getCharCount());
        printValue("class", summaryInformation.getClass().getName());
        printValue("classID", summaryInformation.getClassID());
        printValue("comments", summaryInformation.getComments());
        printValue("createDateTime", summaryInformation.getCreateDateTime());
        printValue("editTime", summaryInformation.getEditTime());
        printValue("firstSection", summaryInformation.getFirstSection());
        printValue("format", summaryInformation.getFormat());
        printValue("keywords", summaryInformation.getKeywords());
        printValue("lastAuthor", summaryInformation.getLastAuthor());
        printValue("lastPrinted", summaryInformation.getLastPrinted());
        printValue("lastSaveDateTime", summaryInformation.getLastSaveDateTime());
        printValue("osVersion", summaryInformation.getOSVersion());
        printValue("pageCount", summaryInformation.getPageCount());
        printValue("properties", describe(summaryInformation.getProperties()));
        printValue("propertySetIDMap", summaryInformation.getPropertySetIDMap());
        printValue("revNumber", summaryInformation.getRevNumber());
        printValue("sectionCount", summaryInformation.getSectionCount());
        // Deliberately not cast to a concrete list class (see the other overload).
        printValue("sections", summaryInformation.getSections());
        printValue("security", summaryInformation.getSecurity());
        printValue("singleSection", summaryInformation.getSingleSection());
        printValue("subject", summaryInformation.getSubject());
        printValue("template", summaryInformation.getTemplate());
        printValue("thumbnail", describe(summaryInformation.getThumbnail()));
        printValue("thumbnailThumbnail", summaryInformation.getThumbnailThumbnail());
        printValue("title", summaryInformation.getTitle());
        printValue("wordCount", summaryInformation.getWordCount());
    }

    /** Output formats supported by {@link #convert}. */
    enum ConverterType {
        HTML,
        TEXT,
        XML
    }

    /**
     * Converts the Word document at {@code srcPath} to the requested format
     * and writes it next to {@code destPathWithoutExtension}, appending
     * {@code .html}, {@code .txt} or {@code .xml} according to {@code type}.
     * Failures are reported on standard error and otherwise ignored.
     *
     * @param srcPath                  location of the {@code .doc} file to read
     * @param destPathWithoutExtension output path without its file extension
     * @param type                     target format
     */
    private static void convert(String srcPath, String destPathWithoutExtension, ConverterType type) {
        try (InputStream is = new FileInputStream(srcPath)) {
            HWPFDocument hwpfDocument = new HWPFDocument(is);
            Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();

            final AbstractWordConverter converter;
            final String method;
            final String extension;
            switch (type) {
            case HTML:
                converter = new WordToHtmlConverter(document);
                method = "html";
                extension = ".html";
                break;
            case TEXT:
                converter = new WordToTextConverter(document);
                method = "text";
                extension = ".txt";
                break;
            case XML:
                converter = new WordToFoConverter(document);
                method = "xml";
                extension = ".xml";
                break;
            default:
                // Guards against a future enum constant leaving the converter unset.
                throw new IllegalArgumentException("Unsupported converter type: " + type);
            }
            converter.processDocument(hwpfDocument);

            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            transformer.setOutputProperty(OutputKeys.ENCODING, OUTPUT_ENCODING);
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            transformer.setOutputProperty(OutputKeys.METHOD, method);

            // With a Writer result the transformer does not encode the bytes
            // itself, so the writer's charset must match the declared encoding.
            // FileWriter would silently use the platform default instead.
            String destPath = destPathWithoutExtension + extension;
            try (Writer writer = new OutputStreamWriter(new FileOutputStream(destPath), OUTPUT_ENCODING)) {
                transformer.transform(new DOMSource(converter.getDocument()), new StreamResult(writer));
            }
        } catch (IOException | ParserConfigurationException | TransformerException
                | TransformerFactoryConfigurationError e) {
            // FileNotFoundException and TransformerConfigurationException are
            // subclasses of the types above and are handled identically.
            e.printStackTrace();
        }
    }

    /** Converts {@code srcPath} to {@code dstPath + ".html"}. */
    private static void convertToHtml(String srcPath, String dstPath) {
        convert(srcPath, dstPath, ConverterType.HTML);
    }

    /** Converts {@code srcPath} to {@code dstPath + ".txt"}. */
    private static void convertToText(String srcPath, String dstPath) {
        convert(srcPath, dstPath, ConverterType.TEXT);
    }

    /** Converts {@code srcPath} to {@code dstPath + ".xml"}. */
    private static void convertToXml(String srcPath, String dstPath) {
        convert(srcPath, dstPath, ConverterType.XML);
    }

    /** Runs the extraction and all three conversions against {@code test.doc}. */
    public static void main(String[] args) {
        String path = "test.doc";
        extract(path);
        convertToHtml(path, "test");
        convertToText(path, "test");
        convertToXml(path, "test");
    }
}
  
 
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164
  • 165
  • 166
  • 167
  • 168
  • 169
  • 170
  • 171
  • 172
  • 173
  • 174
  • 175
  • 176
  • 177
  • 178
  • 179
  • 180
  • 181
  • 182
  • 183
  • 184
  • 185
  • 186
  • 187
  • 188
  • 189
  • 190
  • 191
  • 192
  • 193
  • 194
  • 195
  • 196
  • 197
  • 198
  • 199
  • 200
  • 201
  • 202
  • 203
  • 204
  • 205
  • 206
  • 207
  • 208
  • 209
  • 210
  • 211
  • 212
  • 213
  • 214
  • 215
  • 216
  • 217
  • 218
  • 219
  • 220
  • 221
  • 222
  • 223
  • 224
  • 225
  • 226
  • 227
  • 228
  • 229
  • 230
  • 231
  • 232
  • 233
  • 234
  • 235
  • 236
  • 237
  • 238
  • 239
  • 240
  • 241
  • 242
  • 243
  • 244
  • 245
  • 246
  • 247
  • 248
  • 249
  • 250
  • 251
  • 252
  • 253
  • 254
  • 255
  • 256
  • 257
  • 258
  • 259
  • 260
  • 261
  • 262
  • 263
  • 264
  • 265
  • 266
  • 267
  • 268
  • 269
  • 270
  • 271
  • 272
  • 273
  • 274
  • 275
  • 276
  • 277
  • 278
  • 279
  • 280
  • 281
  • 282
  • 283
  • 284
  • 285
  • 286
  • 287
  • 288
  • 289
  • 290
  • 291
  • 292
  • 293
  • 294
  • 295
  • 296
  • 297
  • 298
  • 299
  • 300
  • 301
  • 302
  • 303
  • 304
  • 305
  • 306
  • 307

文章来源: blog.csdn.net,作者:福州-司马懿,版权归原作者所有,如需转载,请联系作者。

原文链接:blog.csdn.net/chy555chy/article/details/53247563

【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱: cloudbbs@huaweicloud.com
  • 点赞
  • 收藏
  • 关注作者

评论(0)

0/1000
抱歉,系统识别当前为高风险访问,暂不支持该操作

全部回复

上滑加载中

设置昵称

在此一键设置昵称,即可参与社区互动!

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。