001/*
002 * (C) Copyright 2006-2012 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Nuxeo
018 *     Antoine Taillefer
019 *
020 */
021
022package org.nuxeo.ecm.core.convert.plugins.text.extractors;
023
024import java.io.IOException;
025import java.util.zip.ZipEntry;
026import java.util.zip.ZipInputStream;
027
028import org.xml.sax.InputSource;
029import org.xml.sax.SAXException;
030import org.xml.sax.XMLReader;
031
032/**
033 * Docx to text converter: parses the Open XML text document to read its content.
034 */
035public class DOCX2TextConverter extends XmlZip2TextConverter {
036
037    private static final String WORD_DOCUMENT_ZIP_ENTRY_NAME = "word/document.xml";
038
039    protected void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb) throws IOException,
040            SAXException {
041
042        ZipEntry zipEntry = zis.getNextEntry();
043        while (zipEntry != null) {
044            if (WORD_DOCUMENT_ZIP_ENTRY_NAME.equals(zipEntry.getName())) {
045                OpenXmlContentHandler contentHandler = new OpenXmlContentHandler();
046                reader.setContentHandler(contentHandler);
047                reader.parse(new InputSource(zis));
048                sb.append(contentHandler.getContent());
049                break;
050            }
051            zipEntry = zis.getNextEntry();
052        }
053    }
054}