001/* 002 * (C) Copyright 2006-2007 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Nuxeo - initial API and implementation 018 * 019 */ 020package org.nuxeo.ecm.core.convert.plugins.text.extractors; 021 022import java.io.IOException; 023import java.io.Serializable; 024import java.util.HashMap; 025import java.util.zip.ZipEntry; 026import java.util.zip.ZipInputStream; 027 028import javax.xml.parsers.ParserConfigurationException; 029 030import org.nuxeo.ecm.core.api.blobholder.BlobHolder; 031import org.nuxeo.ecm.core.convert.api.ConversionException; 032import org.nuxeo.ecm.core.convert.extension.Converter; 033import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor; 034import org.xml.sax.InputSource; 035import org.xml.sax.SAXException; 036import org.xml.sax.XMLReader; 037 038/** 039 * Base class that contains SAX based text extractor fallback 040 * 041 * @author <a href="mailto:tdelprat@nuxeo.com">Tiry</a> 042 */ 043public abstract class BaseOfficeXMLTextConverter implements Converter { 044 045 public static final String MAX_SIZE = "MAX_SIZE"; 046 047 protected long maxSize4POI = 5 * 1024 * 1014; 048 049 protected BlobHolder runFallBackConverter(BlobHolder blobHolder, final String prefix) throws ConversionException { 050 051 Converter fallback = new XmlZip2TextConverter() { 052 @Override 053 protected void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb) 054 throws IOException, SAXException { 055 ZipEntry zipEntry = zis.getNextEntry(); 056 057 while (zipEntry != null) { 058 if ((zipEntry.getName().startsWith(prefix)) && (zipEntry.getName().endsWith(".xml"))) { 059 Xml2TextHandler xml2text; 060 try { 061 xml2text = new Xml2TextHandler(); 062 sb.append(xml2text.parse(new InputSource(zis))); 063 } catch (ParserConfigurationException e) { 064 throw new IOException("Error during raw XML Text extraction", e); 065 } 066 } 067 zipEntry = zis.getNextEntry(); 068 } 069 } 070 }; 071 return fallback.convert(blobHolder, new HashMap<String, Serializable>()); 072 } 073 074 @Override 075 public void init(ConverterDescriptor descriptor) { 076 String max = descriptor.getParameters().get(MAX_SIZE); 077 if (max != null) { 078 maxSize4POI = Long.parseLong(max); 079 } 080 } 081 082}