001/*
002 * (C) Copyright 2006-2007 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Nuxeo - initial API and implementation
018 *
019 */
020package org.nuxeo.ecm.core.convert.plugins.text.extractors;
021
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.HashMap;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import javax.xml.parsers.ParserConfigurationException;

import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.extension.Converter;
import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
037
038/**
039 * Base class that contains SAX based text extractor fallback
040 *
041 * @author <a href="mailto:tdelprat@nuxeo.com">Tiry</a>
042 */
043public abstract class BaseOfficeXMLTextConverter implements Converter {
044
045    public static final String MAX_SIZE = "MAX_SIZE";
046
047    protected long maxSize4POI = 5 * 1024 * 1014;
048
049    protected BlobHolder runFallBackConverter(BlobHolder blobHolder, final String prefix) throws ConversionException {
050
051        Converter fallback = new XmlZip2TextConverter() {
052            @Override
053            protected void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb)
054                    throws IOException, SAXException {
055                ZipEntry zipEntry = zis.getNextEntry();
056
057                while (zipEntry != null) {
058                    if ((zipEntry.getName().startsWith(prefix)) && (zipEntry.getName().endsWith(".xml"))) {
059                        Xml2TextHandler xml2text;
060                        try {
061                            xml2text = new Xml2TextHandler();
062                            sb.append(xml2text.parse(new InputSource(zis)));
063                        } catch (ParserConfigurationException e) {
064                            throw new IOException("Error during raw XML Text extraction", e);
065                        }
066                    }
067                    zipEntry = zis.getNextEntry();
068                }
069            }
070        };
071        return fallback.convert(blobHolder, new HashMap<String, Serializable>());
072    }
073
074    @Override
075    public void init(ConverterDescriptor descriptor) {
076        String max = descriptor.getParameters().get(MAX_SIZE);
077        if (max != null) {
078            maxSize4POI = Long.parseLong(max);
079        }
080    }
081
082}