001/* 002 * (C) Copyright 2006-2008 Nuxeo SAS (http://nuxeo.com/) and contributors. 003 * 004 * All rights reserved. This program and the accompanying materials 005 * are made available under the terms of the GNU Lesser General Public License 006 * (LGPL) version 2.1 which accompanies this distribution, and is available at 007 * http://www.gnu.org/licenses/lgpl.html 008 * 009 * This library is distributed in the hope that it will be useful, 010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 012 * Lesser General Public License for more details. 013 * 014 * Contributors: 015 * bstefanescu 016 * 017 * $Id$ 018 */ 019 020package org.nuxeo.ecm.core.convert.plugins.text.extractors; 021 022import java.io.File; 023import java.io.IOException; 024import java.io.InputStream; 025 026import javax.xml.parsers.ParserConfigurationException; 027import javax.xml.parsers.SAXParser; 028import javax.xml.parsers.SAXParserFactory; 029 030import org.xml.sax.Attributes; 031import org.xml.sax.InputSource; 032import org.xml.sax.SAXException; 033import org.xml.sax.XMLReader; 034import org.xml.sax.helpers.DefaultHandler; 035 036/** 037 * @author <a href="mailto:bs@nuxeo.com">Bogdan Stefanescu</a> 038 */ 039public class Xml2TextHandler extends DefaultHandler { 040 041 protected static final SAXParserFactory factory = SAXParserFactory.newInstance(); 042 043 static { 044 factory.setValidating(false); 045 factory.setNamespaceAware(false); 046 } 047 048 protected SAXParser parser; 049 050 protected StringBuffer buf; 051 052 protected boolean trim = false; 053 054 public Xml2TextHandler() throws SAXException, ParserConfigurationException { 055 parser = factory.newSAXParser(); 056 XMLReader reader = parser.getXMLReader(); 057 reader.setFeature("http://xml.org/sax/features/validation", false); 058 reader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); 059 reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); 060 } 061 062 public SAXParser getParser() { 063 return parser; 064 } 065 066 public String parse(File file) throws SAXException, IOException { 067 parser.parse(file, this); 068 String text = buf.toString(); 069 buf = null; 070 return text; 071 } 072 073 public String parse(InputStream in) throws SAXException, IOException { 074 parser.parse(in, this); 075 String text = buf.toString(); 076 buf = null; 077 return text; 078 } 079 080 public String parse(InputSource is) throws SAXException, IOException { 081 parser.parse(is, this); 082 String text = buf.toString(); 083 buf = null; 084 return text; 085 } 086 087 public String getText() { 088 return buf.toString(); 089 } 090 091 @Override 092 public void startDocument() throws SAXException { 093 trim = false; 094 buf = new StringBuffer(); 095 } 096 097 @Override 098 public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException { 099 trim = true; 100 } 101 102 @Override 103 public void endElement(String uri, String localName, String name) throws SAXException { 104 trim = true; 105 } 106 107 @Override 108 public void characters(char[] ch, int start, int length) throws SAXException { 109 // buf.append(ch, start, length); if (true) return; 110 if (trim) { 111 int i = start; 112 int end = start + length; 113 while (i < end && Character.isWhitespace(ch[i])) { 114 i++; 115 } 116 buf.append(" ").append(ch, i, length - i + start); 117 trim = false; 118 // System.out.println("["+new String(ch, i, length - i + start)+"]"); 119 } else { 120 buf.append(ch, start, length); 121 // System.out.println("{"+new String(ch, start, length)+"}"); 122 } 123 } 124 125}