001/* 002 * (C) Copyright 2002-2007 Nuxeo SAS (http://nuxeo.com/) and contributors. 003 * 004 * All rights reserved. This program and the accompanying materials 005 * are made available under the terms of the GNU Lesser General Public License 006 * (LGPL) version 2.1 which accompanies this distribution, and is available at 007 * http://www.gnu.org/licenses/lgpl.html 008 * 009 * This library is distributed in the hope that it will be useful, 010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 012 * Lesser General Public License for more details. 013 * 014 * Contributors: 015 * Nuxeo - initial API and implementation 016 * 017 */ 018package org.nuxeo.ecm.core.convert.plugins.text.extractors; 019 020import java.io.IOException; 021import java.io.Serializable; 022import java.util.HashMap; 023import java.util.zip.ZipEntry; 024import java.util.zip.ZipInputStream; 025 026import javax.xml.parsers.ParserConfigurationException; 027 028import org.nuxeo.ecm.core.api.blobholder.BlobHolder; 029import org.nuxeo.ecm.core.convert.api.ConversionException; 030import org.nuxeo.ecm.core.convert.extension.Converter; 031import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor; 032import org.xml.sax.InputSource; 033import org.xml.sax.SAXException; 034import org.xml.sax.XMLReader; 035 036/** 037 * Base class that contains SAX based text extractor fallback 038 * 039 * @author <a href="mailto:tdelprat@nuxeo.com">Tiry</a> 040 */ 041public abstract class BaseOfficeXMLTextConverter implements Converter { 042 043 public static final String MAX_SIZE = "MAX_SIZE"; 044 045 protected long maxSize4POI = 5 * 1024 * 1014; 046 047 protected BlobHolder runFallBackConverter(BlobHolder blobHolder, final String prefix) throws ConversionException { 048 049 Converter fallback = new XmlZip2TextConverter() { 050 @Override 051 protected void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb) 052 throws IOException, SAXException { 053 ZipEntry zipEntry = zis.getNextEntry(); 054 055 while (zipEntry != null) { 056 if ((zipEntry.getName().startsWith(prefix)) && (zipEntry.getName().endsWith(".xml"))) { 057 Xml2TextHandler xml2text; 058 try { 059 xml2text = new Xml2TextHandler(); 060 sb.append(xml2text.parse(new InputSource(zis))); 061 } catch (ParserConfigurationException e) { 062 throw new IOException("Error during raw XML Text extraction", e); 063 } 064 } 065 zipEntry = zis.getNextEntry(); 066 } 067 } 068 }; 069 return fallback.convert(blobHolder, new HashMap<String, Serializable>()); 070 } 071 072 @Override 073 public void init(ConverterDescriptor descriptor) { 074 String max = descriptor.getParameters().get(MAX_SIZE); 075 if (max != null) { 076 maxSize4POI = Long.parseLong(max); 077 } 078 } 079 080}