/*
 * (C) Copyright 2006-2012 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Nuxeo
 *     Antoine Taillefer
 */

package org.nuxeo.ecm.core.convert.plugins.text.extractors;

import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
import java.util.zip.ZipInputStream;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder;
import org.nuxeo.ecm.core.convert.extension.Converter;
import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

/**
 * XML zip to text converter: parses the XML zip entries to read their content.
 * <p>
 * This base class sets up a non-validating SAX reader and opens the input blob as a zip stream; subclasses decide
 * which entries to read and how to turn them into text by implementing
 * {@link #readXmlZipContent(ZipInputStream, XMLReader, StringBuilder)}.
 */
public abstract class XmlZip2TextConverter implements Converter {

    /**
     * Converts the blob held by {@code blobHolder} to text by delegating the reading of its zip entries to
     * {@link #readXmlZipContent(ZipInputStream, XMLReader, StringBuilder)}.
     *
     * @param blobHolder the holder whose blob stream is opened as a zip stream
     * @param parameters conversion parameters; not used by this implementation
     * @return a blob holder wrapping a blob created from the text accumulated during reading
     * @throws ConversionException if an {@link IOException}, a {@link ParserConfigurationException} or a
     *             {@link SAXException} occurs while creating the parser or reading the content
     */
    @Override
    public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {

        SAXParserFactory parserFactory = SAXParserFactory.newInstance();
        parserFactory.setValidating(false);

        try {
            SAXParser parser = parserFactory.newSAXParser();
            XMLReader reader = parser.getXMLReader();
            // No validation, no external DTD loading and no DOCTYPE declaration accepted at all: the zip content
            // is parsed purely for its text.
            reader.setFeature("http://xml.org/sax/features/validation", false);
            reader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
            reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);

            StringBuilder sb = new StringBuilder();
            // NOTE(review): UnclosableZipInputStream is declared elsewhere; presumably its close() is neutralized
            // so that parsing one entry cannot close the whole zip stream, hence the explicit doClose() below
            // instead of try-with-resources -- confirm against that class.
            UnclosableZipInputStream zis = new UnclosableZipInputStream(blobHolder.getBlob().getStream());
            try {
                readXmlZipContent(zis, reader, sb);
            } finally {
                zis.doClose();
            }
            return new SimpleCachableBlobHolder(Blobs.createBlob(sb.toString()));
        } catch (IOException | ParserConfigurationException | SAXException e) {
            throw new ConversionException("Error during OpenXml2Text conversion", e);
        }
    }

    /**
     * Does nothing: this converter does not read anything from its descriptor.
     *
     * @param descriptor the converter descriptor, ignored
     */
    @Override
    public void init(ConverterDescriptor descriptor) {
    }

    /**
     * Reads the content of the given zip stream using the given XML reader.
     * <p>
     * Called once per conversion; whatever {@code sb} contains when this method returns becomes the content of the
     * resulting blob.
     *
     * @param zis the zip stream opened on the blob to convert
     * @param reader the XML reader, already configured by
     *            {@link #convert(BlobHolder, Map)}
     * @param sb the builder receiving the resulting text
     * @throws IOException if reading the zip stream fails
     * @throws SAXException if parsing an XML entry fails
     */
    protected abstract void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb)
            throws IOException, SAXException;
}