/*
 * (C) Copyright 2002-2007 Nuxeo SAS (http://nuxeo.com/) and contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser General Public License
 * (LGPL) version 2.1 which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl.html
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * Contributors:
 *     Nuxeo - initial API and implementation
 *
 */
package org.nuxeo.ecm.core.convert.plugins.text.extractors;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Serializable;
import java.util.Map;

import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.api.impl.blob.StringBlob;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder;
import org.nuxeo.ecm.core.convert.extension.Converter;
import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;

/**
 * Extract the text content of HTML documents while trying to respect the paragraph structure.
 *
 * @author <a href="mailto:troger@nuxeo.com">Thomas Roger</a>
 * @author <a href="mailto:ogrisel@nuxeo.com">Olivier Grisel</a>
 */
public class Html2TextConverter implements Converter {

    private static final Log log = LogFactory.getLog(Html2TextConverter.class);

    /**
     * Renders the HTML blob held by {@code blobHolder} as plain text.
     * <p>
     * The HTML is decoded in one of three ways: a {@link StringBlob} is read as an already decoded string, a blob
     * that declares an encoding is read through a reader using that encoding, and any other blob is handed to the
     * parser as a raw stream so that the parser picks the charset itself. The rendered text then gets unix line
     * endings, has spaces before line breaks removed, has runs of blank lines collapsed to a single blank line, and
     * is trimmed.
     *
     * @param blobHolder holder of the HTML blob to convert
     * @param parameters conversion parameters; not read by this converter
     * @return a new holder wrapping a blob created from the extracted text
     * @throws ConversionException if an {@link IOException} occurs while reading or decoding the blob
     */
    @Override
    public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {
        // Assigned as soon as a blob stream is opened so that the finally block can release it.
        InputStream stream = null;
        try {
            Blob blob = blobHolder.getBlob();
            Source source;
            if (blob instanceof StringBlob) {
                // the underlying source is unambiguously decoded: access the decoded string directly
                source = new Source(blob.getString());
            } else {
                String encoding = blob.getEncoding();
                stream = blob.getStream();
                if (encoding != null) {
                    Reader reader = new InputStreamReader(stream, encoding);
                    source = new Source(reader);
                } else {
                    // otherwise use the parser charset heuristic to decode properly
                    source = new Source(stream);
                }
            }
            String text = cleanUp(render(source));
            return new SimpleCachableBlobHolder(Blobs.createBlob(text));
        } catch (IOException e) {
            throw new ConversionException("Error during Html2Text conversion", e);
        } finally {
            closeQuietly(stream);
        }
    }

    /**
     * Renders the parsed HTML as text with hyperlink URLs and font style decorations switched off.
     */
    private static String render(Source source) {
        Renderer renderer = source.getRenderer();
        renderer.setIncludeHyperlinkURLs(false);
        renderer.setDecorateFontStyles(false);
        return renderer.toString();
    }

    /**
     * Normalizes line endings and whitespace of the rendered text.
     */
    private static String cleanUp(String rendered) {
        String text = rendered.replace("\r\n", "\n"); // unix end of line
        text = text.replaceAll(" *\n", "\n"); // clean trailing spaces
        text = text.replaceAll("\\n\\n+", "\n\n"); // clean multiple lines
        return text.trim();
    }

    /**
     * Closes the blob stream if one was opened; a failure to close is logged and does not fail the conversion.
     */
    private static void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            log.error("Error while closing Blob stream", e);
        }
    }

    /**
     * Does nothing: this converter reads no configuration from the descriptor.
     */
    @Override
    public void init(ConverterDescriptor descriptor) {
    }

}