/*
 * (C) Copyright 2006-2007 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Nuxeo - initial API and implementation
 *
 */
package org.nuxeo.ecm.core.convert.plugins.text.extractors;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Serializable;
import java.util.Map;

import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.api.impl.blob.StringBlob;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder;
import org.nuxeo.ecm.core.convert.extension.Converter;
import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;

/**
 * Extract the text content of HTML documents while trying to respect the paragraph structure.
 *
 * @author <a href="mailto:troger@nuxeo.com">Thomas Roger</a>
 * @author <a href="mailto:ogrisel@nuxeo.com">Olivier Grisel</a>
 */
public class Html2TextConverter implements Converter {

    private static final Log log = LogFactory.getLog(Html2TextConverter.class);

    /**
     * Renders the HTML content of the holder's blob as plain text.
     * <p>
     * The HTML is decoded in one of three ways: a {@link StringBlob} is read as an already decoded string; a blob that
     * declares an encoding is read through a reader using that encoding; any other blob is handed to the parser as a
     * raw stream so that the parser picks the charset itself.
     * <p>
     * The rendered text is then normalized: {@code \r\n} becomes {@code \n}, spaces directly before a line break are
     * removed, runs of two or more line breaks are reduced to exactly two, and the result is trimmed.
     *
     * @param blobHolder the holder whose blob contains the HTML to convert
     * @param parameters conversion parameters; not read by this converter
     * @return a new holder wrapping a blob built from the extracted text
     * @throws ConversionException if reading or decoding the blob fails with an {@link IOException}
     */
    @Override
    public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {

        // Remembers the blob stream opened below so that the finally block can release it.
        InputStream stream = null;
        try {
            Blob blob = blobHolder.getBlob();
            // if the underlying source is unambiguously decoded, access the
            // decoded string directly
            Source source;
            if (blob instanceof StringBlob) {
                source = new Source(blob.getString());
            } else if (blob.getEncoding() != null) {
                // Keep a handle on the raw stream before wrapping it: if the reader cannot be created
                // (e.g. unsupported encoding name) the stream is still closed in the finally block.
                stream = blob.getStream();
                Reader reader = new InputStreamReader(stream, blob.getEncoding());
                source = new Source(reader);
            } else {
                // otherwise use the parser charset heuristic to decode properly
                stream = blob.getStream();
                source = new Source(stream);
            }
            Renderer renderer = source.getRenderer();
            renderer.setIncludeHyperlinkURLs(false);
            renderer.setDecorateFontStyles(false);
            String text = renderer.toString();
            text = text.replace("\r\n", "\n"); // unix end of line (literal replacement, no regex needed)
            text = text.replaceAll(" *\n", "\n"); // clean trailing spaces
            text = text.replaceAll("\\n\\n+", "\n\n"); // clean multiple lines
            text = text.trim();
            return new SimpleCachableBlobHolder(Blobs.createBlob(text));
        } catch (IOException e) {
            throw new ConversionException("Error during Html2Text conversion", e);
        } finally {
            if (stream != null) {
                try {
                    stream.close();
                } catch (IOException e) {
                    // A failure to close must not mask the conversion result or the original error.
                    log.error("Error while closing Blob stream", e);
                }
            }
        }
    }

    /**
     * Does nothing: this converter ignores its descriptor.
     *
     * @param descriptor the converter descriptor; not read
     */
    @Override
    public void init(ConverterDescriptor descriptor) {
    }

}