001/* 002 * (C) Copyright 2006-2012 Nuxeo SAS (http://nuxeo.com/) and contributors. 003 * 004 * All rights reserved. This program and the accompanying materials 005 * are made available under the terms of the GNU Lesser General Public License 006 * (LGPL) version 2.1 which accompanies this distribution, and is available at 007 * http://www.gnu.org/licenses/lgpl.html 008 * 009 * This library is distributed in the hope that it will be useful, 010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 012 * Lesser General Public License for more details. 013 * 014 * Contributors: 015 * Nuxeo 016 * Antoine Taillefer 017 */ 018 019package org.nuxeo.ecm.core.convert.plugins.text.extractors; 020 021import java.io.IOException; 022import java.io.Serializable; 023import java.util.Map; 024import java.util.zip.ZipInputStream; 025 026import javax.xml.parsers.ParserConfigurationException; 027import javax.xml.parsers.SAXParser; 028import javax.xml.parsers.SAXParserFactory; 029 030import org.nuxeo.ecm.core.api.Blobs; 031import org.nuxeo.ecm.core.api.blobholder.BlobHolder; 032import org.nuxeo.ecm.core.convert.api.ConversionException; 033import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder; 034import org.nuxeo.ecm.core.convert.extension.Converter; 035import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor; 036import org.xml.sax.SAXException; 037import org.xml.sax.XMLReader; 038 039/** 040 * XML zip to text converter: parses the XML zip entries to read their content. 041 */ 042public abstract class XmlZip2TextConverter implements Converter { 043 044 public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException { 045 046 SAXParserFactory parserFactory = SAXParserFactory.newInstance(); 047 parserFactory.setValidating(false); 048 049 try { 050 SAXParser parser = parserFactory.newSAXParser(); 051 XMLReader reader = parser.getXMLReader(); 052 reader.setFeature("http://xml.org/sax/features/validation", false); 053 reader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); 054 reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); 055 056 StringBuilder sb = new StringBuilder(); 057 UnclosableZipInputStream zis = new UnclosableZipInputStream(blobHolder.getBlob().getStream()); 058 // ZipInputStream zis = new ZipInputStream( 059 // blobHolder.getBlob().getStream()); 060 try { 061 readXmlZipContent(zis, reader, sb); 062 } finally { 063 zis.doClose(); 064 } 065 return new SimpleCachableBlobHolder(Blobs.createBlob(sb.toString())); 066 } catch (IOException | ParserConfigurationException | SAXException e) { 067 throw new ConversionException("Error during OpenXml2Text conversion", e); 068 } 069 } 070 071 public void init(ConverterDescriptor descriptor) { 072 } 073 074 protected abstract void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb) 075 throws IOException, SAXException; 076}