001/* 002 * (C) Copyright 2006-2012 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Nuxeo 018 * Antoine Taillefer 019 */ 020 021package org.nuxeo.ecm.core.convert.plugins.text.extractors; 022 023import java.util.Stack; 024 025import org.xml.sax.Attributes; 026import org.xml.sax.SAXException; 027import org.xml.sax.helpers.DefaultHandler; 028 029public class OpenXmlContentHandler extends DefaultHandler { 030 031 protected final StringBuffer sb = new StringBuffer(); 032 033 protected final Stack<String> path = new Stack<String>(); 034 035 protected boolean dumpText = false; 036 037 public String getContent() { 038 return sb.toString(); 039 } 040 041 @Override 042 public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { 043 path.push(qName); 044 // Text element of a docx or pptx document 045 if (qName.equals("w:t") || qName.equals("a:t")) { 046 dumpText = true; 047 } 048 // If the paragraph's style is "styleX" with X > 1 (this is a heading, 049 // but not the document title), add a new line. 050 if (qName.equals("w:pStyle") && !"style0".equals(atts.getValue("w:val")) 051 && !"style1".equals(atts.getValue("w:val"))) { 052 sb.append("\n"); 053 054 } 055 // Paragraph: add a new line 056 if (qName.equals("w:p") || qName.equals("a:p")) { 057 sb.append("\n"); 058 } 059 } 060 061 @Override 062 public void characters(char[] ch, int start, int length) throws SAXException { 063 if (dumpText) { 064 String content = String.valueOf(ch, start, length); 065 sb.append(content); 066 } 067 } 068 069 @Override 070 public void endElement(String namespaceURI, String localName, String qName) throws SAXException { 071 path.pop(); 072 if (path.isEmpty() || !path.lastElement().equals("w:t")) { 073 dumpText = false; 074 } 075 } 076 077}