/*
 * (C) Copyright 2006-2012 Nuxeo SAS (http://nuxeo.com/) and contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser General Public License
 * (LGPL) version 2.1 which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl.html
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * Contributors:
 *     Nuxeo
 *     Antoine Taillefer
 *
 */

package org.nuxeo.ecm.core.convert.plugins.text.extractors;

import java.io.IOException;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

/**
 * Docx to text converter: parses the Open XML text document to read its content.
 */
public class DOCX2TextConverter extends XmlZip2TextConverter {

    /** Name of the ZIP entry whose XML is parsed to obtain the document text. */
    private static final String WORD_DOCUMENT_ZIP_ENTRY_NAME = "word/document.xml";

    /**
     * Scans the entries of the given ZIP stream in order and, for the first entry named
     * {@code word/document.xml}, parses it with the supplied reader and appends the content
     * collected by an {@link OpenXmlContentHandler} to {@code sb}. Scanning stops after that
     * entry; if no entry has that name, {@code sb} is left untouched.
     * <p>
     * As a side effect the content handler of {@code reader} is replaced when the entry is found.
     * <p>
     * NOTE(review): presumably this is the extension hook declared by
     * {@link XmlZip2TextConverter}; the parent class is not visible here, so {@code @Override}
     * has not been added. Confirm and annotate if so.
     *
     * @param zis the ZIP stream to read entries from
     * @param reader the XML reader used to parse the matching entry
     * @param sb the builder receiving the extracted content
     * @throws IOException if reading the ZIP stream or the entry fails
     * @throws SAXException if the XML of the matching entry cannot be parsed
     */
    protected void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb) throws IOException,
            SAXException {

        ZipEntry zipEntry;
        while ((zipEntry = zis.getNextEntry()) != null) {
            if (!WORD_DOCUMENT_ZIP_ENTRY_NAME.equals(zipEntry.getName())) {
                continue;
            }
            // The ZIP stream is positioned on the entry's data, so it can be handed
            // directly to the parser as the XML input.
            OpenXmlContentHandler contentHandler = new OpenXmlContentHandler();
            reader.setContentHandler(contentHandler);
            reader.parse(new InputSource(zis));
            sb.append(contentHandler.getContent());
            // Only the first matching entry is read.
            return;
        }
    }
}