Java POI组件——简单提取Word、word转html、text、xml(仅支持doc,不支持docx)
        【摘要】 
                    
                        
                    
                    需要添加的库 
poi-3.15.jar、poi-ooxml-3.15.jar、poi-scratchpad-3.15.jar 
package com.poi.word;
import java.io.F...
    
    
    
    需要添加的库
- poi-3.15.jar
- poi-ooxml-3.15.jar
- poi-scratchpad-3.15.jar
package com.poi.word;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hpsf.ClassID;
import org.apache.poi.hpsf.CustomProperties;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.Property;
import org.apache.poi.hpsf.Section;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hpsf.Thumbnail;
import org.apache.poi.hpsf.wellknown.PropertyIDMap;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.AbstractWordConverter;
import org.apache.poi.hwpf.converter.WordToFoConverter;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.converter.WordToTextConverter;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.w3c.dom.Document;
/**
 * Demonstrates Apache POI's handling of Word documents: dumping text and
 * metadata with {@link WordExtractor}, and converting a document to HTML,
 * plain text and XML through the HWPF converters.
 *
 * <p>POI's Word support is comparatively weak: the Word-to-HTML/text/XML
 * conversion used here only supports {@code .doc}, not {@code .docx}.
 */
public class PoiWordClass {
    /**
     * Output encoding. It is declared to the transformer AND used by the
     * transformer to encode the bytes written to disk, so the two always agree.
     */
    private static final String OUTPUT_ENCODING = "gbk";

    /**
     * Prints everything {@link WordExtractor} exposes for the given document
     * to standard output: body text, headers/footers, comments, notes,
     * text boxes, paragraphs and the summary metadata.
     *
     * <p>I/O failures are reported via {@code printStackTrace()} and do not propagate.
     *
     * @param path path of the Word document to read
     */
    private static void extract(String path) {
        InputStream is = null;
        try {
            is = new FileInputStream(path);
            WordExtractor extractor = new WordExtractor(is);

            printHeading("extractor.getText()");
            System.out.println(extractor.getText());
            printHeading("extractor.getTextFromPieces()");
            System.out.println(extractor.getTextFromPieces());
            printHeading("extractor.getHeaderText()");
            System.out.println(extractor.getHeaderText());
            printHeading("extractor.getFooterText()");
            System.out.println(extractor.getFooterText());

            printHeading("extractor.getCommentsText()");
            printLines(extractor.getCommentsText());
            printHeading("extractor.getEndnoteText()");
            printLines(extractor.getEndnoteText());
            printHeading("extractor.getFootnoteText()");
            printLines(extractor.getFootnoteText());
            printHeading("extractor.getMainTextboxText()");
            printLines(extractor.getMainTextboxText());
            printHeading("extractor.getParagraphText()");
            printLines(extractor.getParagraphText());

            printHeading("extractor.getDocSummaryInformation().toString()");
            DocumentSummaryInformation docSummaryInformation = extractor.getDocSummaryInformation();
            // println(Object) prints "null" instead of throwing when the
            // summary information is absent.
            System.out.println(docSummaryInformation);

            // The label now names the method whose result is actually printed.
            printHeading("extractor.getMetadataTextExtractor().getText()");
            POITextExtractor metadataTextExtractor = extractor.getMetadataTextExtractor();
            System.out.println(metadataTextExtractor.getText());

            printHeading("extractor.getSummaryInformation().toString()");
            SummaryInformation summaryInformation = extractor.getSummaryInformation();
            System.out.println(summaryInformation);

            if (docSummaryInformation != null) {
                print(docSummaryInformation);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeAndReport(is);
        }
    }

    /** Prints a blank line followed by {@code label}, used as a section heading. */
    private static void printHeading(String label) {
        System.out.println("\n" + label);
    }

    /** Prints each element of {@code lines} on its own line. */
    private static void printLines(String[] lines) {
        for (String line : lines) {
            System.out.println(line);
        }
    }

    /**
     * Closes {@code resource} if it is non-null, reporting (not propagating)
     * any {@link IOException} raised by the close.
     */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads every property getter of a {@link DocumentSummaryInformation}.
     *
     * <p>Despite its name this method writes nothing: it only assigns each
     * value to a local, serving as a catalogue of the available getters.
     *
     * @param docSummaryInformation the summary to read; must not be null
     */
    private static void print(DocumentSummaryInformation docSummaryInformation) {
        int applicationVersion = docSummaryInformation.getApplicationVersion();
        int byteCount = docSummaryInformation.getByteCount();
        int byteOrder = docSummaryInformation.getByteOrder();
        String category = docSummaryInformation.getCategory();
        int charCountWithSpaces = docSummaryInformation.getCharCountWithSpaces();
        Class<? extends DocumentSummaryInformation> cls = docSummaryInformation.getClass();
        String company = docSummaryInformation.getCompany();
        String contentStatus = docSummaryInformation.getContentStatus();
        String contentType = docSummaryInformation.getContentType();
        CustomProperties customProperties = docSummaryInformation.getCustomProperties();
        // java.lang.UnsupportedOperationException: Reading byte arrays is not yet implemented.
        // byte[] docparts = docSummaryInformation.getDocparts();
        String documentVersion = docSummaryInformation.getDocumentVersion();
        Section section = docSummaryInformation.getFirstSection();
        int format = docSummaryInformation.getFormat();
        // java.lang.UnsupportedOperationException: Reading byte arrays is not yet implemented.
        // byte[] headingPair = docSummaryInformation.getHeadingPair();
        int hiddenCount = docSummaryInformation.getHiddenCount();
        boolean hyperlinksChanged = docSummaryInformation.getHyperlinksChanged();
        String language = docSummaryInformation.getLanguage();
        int lineCount = docSummaryInformation.getLineCount();
        boolean linksDirty = docSummaryInformation.getLinksDirty();
        String manager = docSummaryInformation.getManager();
        int mmClipCount = docSummaryInformation.getMMClipCount();
        int noteCount = docSummaryInformation.getNoteCount();
        int osVersion = docSummaryInformation.getOSVersion();
        int parCount = docSummaryInformation.getParCount();
        String presentationFormat = docSummaryInformation.getPresentationFormat();
        Property[] properties = docSummaryInformation.getProperties();
        PropertyIDMap propertyIDMap = docSummaryInformation.getPropertySetIDMap();
        boolean scale = docSummaryInformation.getScale();
        int sectionCount = docSummaryInformation.getSectionCount();
        // Held as List: the concrete list class is an implementation detail,
        // and downcasting to it risks a ClassCastException.
        List<Section> sections = docSummaryInformation.getSections();
        // org.apache.poi.hpsf.NoSingleSectionException: Property set contains 2 sections.
        // Section singleSection = docSummaryInformation.getSingleSection();
        int slideCount = docSummaryInformation.getSlideCount();
        byte[] vbaDigitalSignature = docSummaryInformation.getVBADigitalSignature();
        // Note from the original author: dumping the fields by reflection is
        // ineffective here because the values are kept internally in a
        // HashMap<Long, String>.
    }

    /**
     * Reads the getters of a {@link POITextExtractor}; like the other
     * {@code print} overloads it only assigns to locals and writes nothing.
     *
     * @param metadataTextExtractor the extractor to read; must not be null
     */
    private static void print(POITextExtractor metadataTextExtractor) {
        Class<? extends POITextExtractor> cls = metadataTextExtractor.getClass();
        POITextExtractor poiTextExtractor = metadataTextExtractor.getMetadataTextExtractor();
        String text = metadataTextExtractor.getText();
    }

    /**
     * Reads every property getter of a {@link SummaryInformation}; like the
     * other {@code print} overloads it only assigns to locals and writes nothing.
     *
     * @param summaryInformation the summary to read; must not be null
     */
    private static void print(SummaryInformation summaryInformation) {
        String applicationName = summaryInformation.getApplicationName();
        String author = summaryInformation.getAuthor();
        int byteOrder = summaryInformation.getByteOrder();
        int charCount = summaryInformation.getCharCount();
        Class<? extends SummaryInformation> cls = summaryInformation.getClass();
        ClassID classID = summaryInformation.getClassID();
        String comments = summaryInformation.getComments();
        Date createDateTime = summaryInformation.getCreateDateTime();
        long editTime = summaryInformation.getEditTime();
        Section section = summaryInformation.getFirstSection();
        int format = summaryInformation.getFormat();
        String keywords = summaryInformation.getKeywords();
        String lastAuthor = summaryInformation.getLastAuthor();
        Date lastPrinted = summaryInformation.getLastPrinted();
        Date lastSaveDateTime = summaryInformation.getLastSaveDateTime();
        int osVersion = summaryInformation.getOSVersion();
        int pageCount = summaryInformation.getPageCount();
        Property[] properties = summaryInformation.getProperties();
        PropertyIDMap propertySetIDMap = summaryInformation.getPropertySetIDMap();
        String recNumber = summaryInformation.getRevNumber();
        int sectionCount = summaryInformation.getSectionCount();
        // Held as List for the same reason as in the DocumentSummaryInformation overload.
        List<Section> sections = summaryInformation.getSections();
        int security = summaryInformation.getSecurity();
        Section singleSection = summaryInformation.getSingleSection();
        String subject = summaryInformation.getSubject();
        String template = summaryInformation.getTemplate();
        byte[] thumbnail = summaryInformation.getThumbnail();
        Thumbnail thumbnailThumbnail = summaryInformation.getThumbnailThumbnail();
        String title = summaryInformation.getTitle();
        int wordCount = summaryInformation.getWordCount();
    }

    /** Output formats supported by {@link #convert(String, String, ConverterType)}. */
    enum ConverterType {
        HTML,
        TEXT,
        XML
    }

    /**
     * Converts a Word document to the requested format and writes the result
     * to {@code destPathWithoutExtension} plus {@code .html}, {@code .txt} or
     * {@code .xml} depending on {@code type}.
     *
     * <p>The result is handed to the transformer as a byte stream so that the
     * transformer itself encodes the output in {@link #OUTPUT_ENCODING}. Writing
     * through a platform-default {@code Writer} would make the bytes on disk
     * disagree with the declared encoding whenever the platform charset differs.
     *
     * <p>I/O, parser and transformer failures are reported via
     * {@code printStackTrace()} and do not propagate.
     *
     * @param srcPath                  path of the Word document to read
     * @param destPathWithoutExtension output path without the file extension
     * @param type                     target format
     * @throws IllegalArgumentException if {@code type} is not a known format
     */
    private static void convert(String srcPath, String destPathWithoutExtension, ConverterType type) {
        if (type == null) {
            throw new IllegalArgumentException("type must not be null");
        }
        InputStream is = null;
        OutputStream os = null;
        try {
            is = new FileInputStream(srcPath);
            HWPFDocument hwpfDocument = new HWPFDocument(is);
            Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();

            AbstractWordConverter converter;
            String method;
            String destPath;
            switch (type) {
            case HTML:
                converter = new WordToHtmlConverter(document);
                method = "html";
                destPath = destPathWithoutExtension + ".html";
                break;
            case TEXT:
                converter = new WordToTextConverter(document);
                method = "text";
                destPath = destPathWithoutExtension + ".txt";
                break;
            case XML:
                converter = new WordToFoConverter(document);
                method = "xml";
                destPath = destPathWithoutExtension + ".xml";
                break;
            default:
                throw new IllegalArgumentException("Unsupported converter type: " + type);
            }
            converter.processDocument(hwpfDocument);

            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            transformer.setOutputProperty(OutputKeys.ENCODING, OUTPUT_ENCODING);
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            transformer.setOutputProperty(OutputKeys.METHOD, method);

            os = new FileOutputStream(destPath);
            transformer.transform(new DOMSource(converter.getDocument()), new StreamResult(os));
        } catch (IOException e) {
            // Also covers FileNotFoundException.
            e.printStackTrace();
        } catch (ParserConfigurationException e) {
            e.printStackTrace();
        } catch (TransformerFactoryConfigurationError e) {
            e.printStackTrace();
        } catch (TransformerException e) {
            // Also covers TransformerConfigurationException.
            e.printStackTrace();
        } finally {
            closeAndReport(is);
            closeAndReport(os);
        }
    }

    /** Converts {@code srcPath} to HTML at {@code dstPath + ".html"}. */
    private static void convertToHtml(String srcPath, String dstPath) {
        convert(srcPath, dstPath, ConverterType.HTML);
    }

    /** Converts {@code srcPath} to plain text at {@code dstPath + ".txt"}. */
    private static void convertToText(String srcPath, String dstPath) {
        convert(srcPath, dstPath, ConverterType.TEXT);
    }

    /** Converts {@code srcPath} to XML at {@code dstPath + ".xml"}. */
    private static void convertToXml(String srcPath, String dstPath) {
        convert(srcPath, dstPath, ConverterType.XML);
    }

    /**
     * Runs the extraction demo and all three conversions.
     *
     * @param args optional: {@code args[0]} is the source document
     *             (default {@code "test.doc"}); {@code args[1]} is the output
     *             path without extension (default {@code "test"})
     */
    public static void main(String[] args) {
        String path = args.length > 0 ? args[0] : "test.doc";
        String destBase = args.length > 1 ? args[1] : "test";
        extract(path);
        convertToHtml(path, destBase);
        convertToText(path, destBase);
        convertToXml(path, destBase);
    }
}
  
 - 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
- 117
- 118
- 119
- 120
- 121
- 122
- 123
- 124
- 125
- 126
- 127
- 128
- 129
- 130
- 131
- 132
- 133
- 134
- 135
- 136
- 137
- 138
- 139
- 140
- 141
- 142
- 143
- 144
- 145
- 146
- 147
- 148
- 149
- 150
- 151
- 152
- 153
- 154
- 155
- 156
- 157
- 158
- 159
- 160
- 161
- 162
- 163
- 164
- 165
- 166
- 167
- 168
- 169
- 170
- 171
- 172
- 173
- 174
- 175
- 176
- 177
- 178
- 179
- 180
- 181
- 182
- 183
- 184
- 185
- 186
- 187
- 188
- 189
- 190
- 191
- 192
- 193
- 194
- 195
- 196
- 197
- 198
- 199
- 200
- 201
- 202
- 203
- 204
- 205
- 206
- 207
- 208
- 209
- 210
- 211
- 212
- 213
- 214
- 215
- 216
- 217
- 218
- 219
- 220
- 221
- 222
- 223
- 224
- 225
- 226
- 227
- 228
- 229
- 230
- 231
- 232
- 233
- 234
- 235
- 236
- 237
- 238
- 239
- 240
- 241
- 242
- 243
- 244
- 245
- 246
- 247
- 248
- 249
- 250
- 251
- 252
- 253
- 254
- 255
- 256
- 257
- 258
- 259
- 260
- 261
- 262
- 263
- 264
- 265
- 266
- 267
- 268
- 269
- 270
- 271
- 272
- 273
- 274
- 275
- 276
- 277
- 278
- 279
- 280
- 281
- 282
- 283
- 284
- 285
- 286
- 287
- 288
- 289
- 290
- 291
- 292
- 293
- 294
- 295
- 296
- 297
- 298
- 299
- 300
- 301
- 302
- 303
- 304
- 305
- 306
- 307
文章来源: blog.csdn.net,作者:福州-司马懿,版权归原作者所有,如需转载,请联系作者。
原文链接:blog.csdn.net/chy555chy/article/details/53247563
        【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
            cloudbbs@huaweicloud.com
        
        
        
        
        
        
        - 点赞
- 收藏
- 关注作者
 
             
           
评论(0)