/*
 * (C) Copyright 2006-2012 Nuxeo SA (http://nuxeo.com/) and contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser General Public License
 * (LGPL) version 2.1 which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl.html
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * Contributors:
 *     Nuxeo
 *     Antoine Taillefer
 */

package org.nuxeo.ecm.core.convert.plugins.text.extractors;

import java.util.Stack;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that accumulates the character data found inside {@code text:*}
 * elements into a plain-text buffer, inserting line breaks and separators
 * around a handful of known elements ({@code text:h}, {@code text:p},
 * {@code draw:page}, {@code table:table-row}, {@code table:table-cell}).
 * <p>
 * The handler is stateful: use one instance per parsed document and read the
 * result with {@link #getContent()} once parsing is done.
 */
public class OOoXmlContentHandler extends DefaultHandler {

    protected static final Log log = LogFactory.getLog(OOoXmlContentHandler.class);

    /** Accumulated plain text. */
    protected final StringBuffer sb = new StringBuffer();

    /** Qualified names of the currently open elements, innermost on top. */
    protected final Stack<String> path = new Stack<String>();

    /** Whether character data received by {@link #characters} is appended to the buffer. */
    protected boolean dumpText = false;

    /** Set once an {@code office:spreadsheet} element has been opened. */
    protected boolean isSpreadSheet = false;

    /**
     * Returns the text accumulated so far.
     *
     * @return the content of the internal buffer as a string
     */
    public String getContent() {
        return sb.toString();
    }

    /**
     * Records the opened element on the path stack, updates the spreadsheet
     * and text-dumping flags, and appends the line breaks that precede
     * headings, paragraphs and pages.
     */
    @Override
    public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
        path.push(qName);

        // Detect spreadsheet
        if (qName.equals("office:spreadsheet")) {
            isSpreadSheet = true;
        }

        // Text element
        if (qName.startsWith("text:")) {
            dumpText = true;
        }

        // Heading (Writer only): add a new line.
        // If the heading's outline level is > 1 (not the document title), add
        // an extra new line.
        if (qName.equals("text:h")) {
            sb.append("\n");
            String outlineLevelAtt = atts.getValue("text:outline-level");
            if (!StringUtils.isEmpty(outlineLevelAtt)) {
                // -1 keeps the "extra new line" test false when parsing fails.
                int outlineLevel = -1;
                try {
                    outlineLevel = Integer.parseInt(outlineLevelAtt);
                } catch (NumberFormatException nfe) {
                    log.warn("Attribute 'text:outline-level' on element 'text:h' has a non integer value.");
                }
                if (outlineLevel > 1) {
                    sb.append("\n");
                }
            }
        }

        // Paragraph: add a new line
        if (!isSpreadSheet && qName.equals("text:p")) {
            sb.append("\n");
        }

        // Page (Impress only): add a new line if not the first one
        if (qName.equals("draw:page") && !"page1".equals(atts.getValue("draw:name"))) {
            sb.append("\n");
        }
    }

    /**
     * Appends the received characters to the buffer while text dumping is
     * enabled; ignores them otherwise.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (dumpText) {
            sb.append(ch, start, length);
        }
    }

    /**
     * Removes the closed element from the path stack, re-evaluates the
     * text-dumping flag against the element that is now innermost, and appends
     * the spreadsheet-specific separators.
     */
    @Override
    public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
        path.pop();

        // Re-derive the flag from the new innermost element instead of only
        // ever clearing it: a nested text element closing inside a non-text
        // child (e.g. a frame anchored in a paragraph) switches dumping off,
        // and it has to be switched back on when we return to the enclosing
        // text element, otherwise the rest of that element's text is lost.
        dumpText = !path.isEmpty() && path.peek().startsWith("text:");

        // Specific separators for spreadsheets:
        if (isSpreadSheet) {

            // End of table row: add a blank line
            if (qName.equals("table:table-row")) {
                sb.append("\n\n");
            }

            // End of table cell: add a separator
            if (qName.equals("table:table-cell")) {
                sb.append(" ");
            }

            // End of paragraph: add a white space
            if (qName.equals("text:p")) {
                sb.append(" ");
            }
        }
    }

}