001/*
002 * (C) Copyright 2002-2007 Nuxeo SAS (http://nuxeo.com/) and contributors.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the GNU Lesser General Public License
006 * (LGPL) version 2.1 which accompanies this distribution, and is available at
007 * http://www.gnu.org/licenses/lgpl.html
008 *
009 * This library is distributed in the hope that it will be useful,
010 * but WITHOUT ANY WARRANTY; without even the implied warranty of
011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
012 * Lesser General Public License for more details.
013 *
014 * Contributors:
015 *     Nuxeo - initial API and implementation
016 *
017 */
018package org.nuxeo.ecm.core.convert.plugins.text.extractors;
019
020import java.io.IOException;
021import java.io.Serializable;
022import java.util.HashMap;
023import java.util.zip.ZipEntry;
024import java.util.zip.ZipInputStream;
025
026import javax.xml.parsers.ParserConfigurationException;
027
028import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
029import org.nuxeo.ecm.core.convert.api.ConversionException;
030import org.nuxeo.ecm.core.convert.extension.Converter;
031import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;
032import org.xml.sax.InputSource;
033import org.xml.sax.SAXException;
034import org.xml.sax.XMLReader;
035
036/**
037 * Base class that contains SAX based text extractor fallback
038 *
039 * @author <a href="mailto:tdelprat@nuxeo.com">Tiry</a>
040 */
041public abstract class BaseOfficeXMLTextConverter implements Converter {
042
043    public static final String MAX_SIZE = "MAX_SIZE";
044
045    protected long maxSize4POI = 5 * 1024 * 1014;
046
047    protected BlobHolder runFallBackConverter(BlobHolder blobHolder, final String prefix) throws ConversionException {
048
049        Converter fallback = new XmlZip2TextConverter() {
050            @Override
051            protected void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb)
052                    throws IOException, SAXException {
053                ZipEntry zipEntry = zis.getNextEntry();
054
055                while (zipEntry != null) {
056                    if ((zipEntry.getName().startsWith(prefix)) && (zipEntry.getName().endsWith(".xml"))) {
057                        Xml2TextHandler xml2text;
058                        try {
059                            xml2text = new Xml2TextHandler();
060                            sb.append(xml2text.parse(new InputSource(zis)));
061                        } catch (ParserConfigurationException e) {
062                            throw new IOException("Error during raw XML Text extraction", e);
063                        }
064                    }
065                    zipEntry = zis.getNextEntry();
066                }
067            }
068        };
069        return fallback.convert(blobHolder, new HashMap<String, Serializable>());
070    }
071
072    @Override
073    public void init(ConverterDescriptor descriptor) {
074        String max = descriptor.getParameters().get(MAX_SIZE);
075        if (max != null) {
076            maxSize4POI = Long.parseLong(max);
077        }
078    }
079
080}