001/* 002 * (C) Copyright 2006-2008 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * bstefanescu 018 * 019 * $Id$ 020 */ 021 022package org.nuxeo.ecm.core.convert.plugins.text.extractors; 023 024import java.io.File; 025import java.io.IOException; 026import java.io.InputStream; 027 028import javax.xml.parsers.ParserConfigurationException; 029import javax.xml.parsers.SAXParser; 030import javax.xml.parsers.SAXParserFactory; 031 032import org.xml.sax.Attributes; 033import org.xml.sax.InputSource; 034import org.xml.sax.SAXException; 035import org.xml.sax.XMLReader; 036import org.xml.sax.helpers.DefaultHandler; 037 038/** 039 * @author <a href="mailto:bs@nuxeo.com">Bogdan Stefanescu</a> 040 */ 041public class Xml2TextHandler extends DefaultHandler { 042 043 protected static final SAXParserFactory factory = SAXParserFactory.newInstance(); 044 045 static { 046 factory.setValidating(false); 047 factory.setNamespaceAware(false); 048 } 049 050 protected SAXParser parser; 051 052 protected StringBuilder builder; 053 054 protected boolean trim = false; 055 056 public Xml2TextHandler() throws SAXException, ParserConfigurationException { 057 parser = factory.newSAXParser(); 058 XMLReader reader = parser.getXMLReader(); 059 reader.setFeature("http://xml.org/sax/features/validation", false); 060 reader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); 061 reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); 062 } 063 064 public SAXParser getParser() { 065 return parser; 066 } 067 068 public String parse(File file) throws SAXException, IOException { 069 parser.parse(file, this); 070 String text = builder.toString(); 071 builder = null; 072 return text; 073 } 074 075 public String parse(InputStream in) throws SAXException, IOException { 076 parser.parse(in, this); 077 String text = builder.toString(); 078 builder = null; 079 return text; 080 } 081 082 public String parse(InputSource is) throws SAXException, IOException { 083 parser.parse(is, this); 084 String text = builder.toString(); 085 builder = null; 086 return text; 087 } 088 089 public String getText() { 090 return builder.toString(); 091 } 092 093 @Override 094 public void startDocument() throws SAXException { 095 trim = false; 096 builder = new StringBuilder(); 097 } 098 099 @Override 100 public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException { 101 trim = true; 102 } 103 104 @Override 105 public void endElement(String uri, String localName, String name) throws SAXException { 106 trim = true; 107 } 108 109 @Override 110 public void characters(char[] ch, int start, int length) throws SAXException { 111 // sb.append(ch, start, length); if (true) return; 112 if (trim) { 113 int i = start; 114 int end = start + length; 115 while (i < end && Character.isWhitespace(ch[i])) { 116 i++; 117 } 118 builder.append(" ").append(ch, i, length - i + start); 119 trim = false; 120 // System.out.println("["+new String(ch, i, length - i + start)+"]"); 121 } else { 122 builder.append(ch, start, length); 123 // System.out.println("{"+new String(ch, start, length)+"}"); 124 } 125 } 126 127}