001/*
002 * (C) Copyright 2006-2012 Nuxeo SAS (http://nuxeo.com/) and contributors.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the GNU Lesser General Public License
006 * (LGPL) version 2.1 which accompanies this distribution, and is available at
007 * http://www.gnu.org/licenses/lgpl.html
008 *
009 * This library is distributed in the hope that it will be useful,
010 * but WITHOUT ANY WARRANTY; without even the implied warranty of
011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
012 * Lesser General Public License for more details.
013 *
014 * Contributors:
015 *     Nuxeo
016 *     Antoine Taillefer
017 */
018
019package org.nuxeo.ecm.core.convert.plugins.text.extractors;
020
021import java.io.IOException;
022import java.io.Serializable;
023import java.util.Map;
024import java.util.zip.ZipInputStream;
025
026import javax.xml.parsers.ParserConfigurationException;
027import javax.xml.parsers.SAXParser;
028import javax.xml.parsers.SAXParserFactory;
029
030import org.nuxeo.ecm.core.api.Blobs;
031import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
032import org.nuxeo.ecm.core.convert.api.ConversionException;
033import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder;
034import org.nuxeo.ecm.core.convert.extension.Converter;
035import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;
036import org.xml.sax.SAXException;
037import org.xml.sax.XMLReader;
038
039/**
040 * XML zip to text converter: parses the XML zip entries to read their content.
041 */
042public abstract class XmlZip2TextConverter implements Converter {
043
044    public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {
045
046        SAXParserFactory parserFactory = SAXParserFactory.newInstance();
047        parserFactory.setValidating(false);
048
049        try {
050            SAXParser parser = parserFactory.newSAXParser();
051            XMLReader reader = parser.getXMLReader();
052            reader.setFeature("http://xml.org/sax/features/validation", false);
053            reader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
054            reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
055
056            StringBuilder sb = new StringBuilder();
057            UnclosableZipInputStream zis = new UnclosableZipInputStream(blobHolder.getBlob().getStream());
058            // ZipInputStream zis = new ZipInputStream(
059            // blobHolder.getBlob().getStream());
060            try {
061                readXmlZipContent(zis, reader, sb);
062            } finally {
063                zis.doClose();
064            }
065            return new SimpleCachableBlobHolder(Blobs.createBlob(sb.toString()));
066        } catch (IOException | ParserConfigurationException | SAXException e) {
067            throw new ConversionException("Error during OpenXml2Text conversion", e);
068        }
069    }
070
071    public void init(ConverterDescriptor descriptor) {
072    }
073
074    protected abstract void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb)
075            throws IOException, SAXException;
076}