001/* 002 * (C) Copyright 2006-2012 Nuxeo SA (http://nuxeo.com/) and contributors. 003 * 004 * All rights reserved. This program and the accompanying materials 005 * are made available under the terms of the GNU Lesser General Public License 006 * (LGPL) version 2.1 which accompanies this distribution, and is available at 007 * http://www.gnu.org/licenses/lgpl.html 008 * 009 * This library is distributed in the hope that it will be useful, 010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 012 * Lesser General Public License for more details. 013 * 014 * Contributors: 015 * Nuxeo 016 * Antoine Taillefer 017 */ 018 019package org.nuxeo.ecm.core.convert.plugins.text.extractors; 020 021import java.util.Stack; 022 023import org.xml.sax.Attributes; 024import org.xml.sax.SAXException; 025import org.xml.sax.helpers.DefaultHandler; 026 027public class OpenXmlContentHandler extends DefaultHandler { 028 029 protected final StringBuffer sb = new StringBuffer(); 030 031 protected final Stack<String> path = new Stack<String>(); 032 033 protected boolean dumpText = false; 034 035 public String getContent() { 036 return sb.toString(); 037 } 038 039 @Override 040 public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { 041 path.push(qName); 042 // Text element of a docx or pptx document 043 if (qName.equals("w:t") || qName.equals("a:t")) { 044 dumpText = true; 045 } 046 // If the paragraph's style is "styleX" with X > 1 (this is a heading, 047 // but not the document title), add a new line. 048 if (qName.equals("w:pStyle") && !"style0".equals(atts.getValue("w:val")) 049 && !"style1".equals(atts.getValue("w:val"))) { 050 sb.append("\n"); 051 052 } 053 // Paragraph: add a new line 054 if (qName.equals("w:p") || qName.equals("a:p")) { 055 sb.append("\n"); 056 } 057 } 058 059 @Override 060 public void characters(char[] ch, int start, int length) throws SAXException { 061 if (dumpText) { 062 String content = String.valueOf(ch, start, length); 063 sb.append(content); 064 } 065 } 066 067 @Override 068 public void endElement(String namespaceURI, String localName, String qName) throws SAXException { 069 path.pop(); 070 if (path.isEmpty() || !path.lastElement().equals("w:t")) { 071 dumpText = false; 072 } 073 } 074 075}