001/* 002 * (C) Copyright 2006-2012 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Nuxeo 018 * Antoine Taillefer 019 * 020 */ 021 022package org.nuxeo.ecm.core.convert.plugins.text.extractors; 023 024import java.io.IOException; 025import java.util.zip.ZipEntry; 026import java.util.zip.ZipInputStream; 027 028import org.xml.sax.InputSource; 029import org.xml.sax.SAXException; 030import org.xml.sax.XMLReader; 031 032/** 033 * Docx to text converter: parses the Open XML text document to read its content. 034 */ 035public class DOCX2TextConverter extends XmlZip2TextConverter { 036 037 private static final String WORD_DOCUMENT_ZIP_ENTRY_NAME = "word/document.xml"; 038 039 @Override 040 protected void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb) throws IOException, 041 SAXException { 042 043 ZipEntry zipEntry = zis.getNextEntry(); 044 while (zipEntry != null) { 045 if (WORD_DOCUMENT_ZIP_ENTRY_NAME.equals(zipEntry.getName())) { 046 OpenXmlContentHandler contentHandler = new OpenXmlContentHandler(); 047 reader.setContentHandler(contentHandler); 048 reader.parse(new InputSource(zis)); 049 sb.append(contentHandler.getContent()); 050 break; 051 } 052 zipEntry = zis.getNextEntry(); 053 } 054 } 055}