001/*
002 * (C) Copyright 2006-2012 Nuxeo SAS (http://nuxeo.com/) and contributors.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the GNU Lesser General Public License
006 * (LGPL) version 2.1 which accompanies this distribution, and is available at
007 * http://www.gnu.org/licenses/lgpl.html
008 *
009 * This library is distributed in the hope that it will be useful,
010 * but WITHOUT ANY WARRANTY; without even the implied warranty of
011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
012 * Lesser General Public License for more details.
013 *
014 * Contributors:
015 *     Nuxeo
016 *     Antoine Taillefer
017 *
018 */
019
020package org.nuxeo.ecm.core.convert.plugins.text.extractors;
021
022import java.io.IOException;
023import java.util.zip.ZipEntry;
024import java.util.zip.ZipInputStream;
025
026import org.xml.sax.InputSource;
027import org.xml.sax.SAXException;
028import org.xml.sax.XMLReader;
029
030/**
031 * Docx to text converter: parses the Open XML text document to read its content.
032 */
033public class DOCX2TextConverter extends XmlZip2TextConverter {
034
035    private static final String WORD_DOCUMENT_ZIP_ENTRY_NAME = "word/document.xml";
036
037    protected void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb) throws IOException,
038            SAXException {
039
040        ZipEntry zipEntry = zis.getNextEntry();
041        while (zipEntry != null) {
042            if (WORD_DOCUMENT_ZIP_ENTRY_NAME.equals(zipEntry.getName())) {
043                OpenXmlContentHandler contentHandler = new OpenXmlContentHandler();
044                reader.setContentHandler(contentHandler);
045                reader.parse(new InputSource(zis));
046                sb.append(contentHandler.getContent());
047                break;
048            }
049            zipEntry = zis.getNextEntry();
050        }
051    }
052}