001/* 002 * (C) Copyright 2006-2012 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Nuxeo 018 * Antoine Taillefer 019 */ 020 021package org.nuxeo.ecm.core.convert.plugins.text.extractors; 022 023import java.util.Stack; 024 025import org.apache.commons.lang.StringUtils; 026import org.apache.commons.logging.Log; 027import org.apache.commons.logging.LogFactory; 028import org.xml.sax.Attributes; 029import org.xml.sax.SAXException; 030import org.xml.sax.helpers.DefaultHandler; 031 032public class OOoXmlContentHandler extends DefaultHandler { 033 034 protected static final Log log = LogFactory.getLog(OOoXmlContentHandler.class); 035 036 protected final StringBuffer sb = new StringBuffer(); 037 038 protected final Stack<String> path = new Stack<String>(); 039 040 protected boolean dumpText = false; 041 042 protected boolean isSpreadSheet = false; 043 044 public String getContent() { 045 return sb.toString(); 046 } 047 048 @Override 049 public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { 050 path.push(qName); 051 052 // Detect spreadsheet 053 if (qName.equals("office:spreadsheet")) { 054 isSpreadSheet = true; 055 } 056 057 // Text element 058 if (qName.startsWith("text:")) { 059 dumpText = true; 060 } 061 // Heading (Writer only): add a new line. 062 // If the heading's outline level is > 1 (not the document title), add 063 // an extra new line. 064 if (qName.equals("text:h")) { 065 sb.append("\n"); 066 String outlineLevelAtt = atts.getValue("text:outline-level"); 067 if (!StringUtils.isEmpty(outlineLevelAtt)) { 068 int outlineLevel = -1; 069 try { 070 outlineLevel = Integer.parseInt(outlineLevelAtt); 071 } catch (NumberFormatException nfe) { 072 log.warn("Attribute 'text:outline-level' on element 'text:h' has a non integer value."); 073 } 074 if (outlineLevel > 1) { 075 sb.append("\n"); 076 } 077 } 078 } 079 // Paragraph: add a new line 080 if (!isSpreadSheet && qName.equals("text:p")) { 081 sb.append("\n"); 082 } 083 084 // Page (Impress only): add a new line if not the first one 085 if (qName.equals("draw:page") && !"page1".equals(atts.getValue("draw:name"))) { 086 sb.append("\n"); 087 } 088 } 089 090 @Override 091 public void characters(char[] ch, int start, int length) throws SAXException { 092 if (dumpText) { 093 String content = String.valueOf(ch, start, length); 094 sb.append(content); 095 } 096 } 097 098 @Override 099 public void endElement(String namespaceURI, String localName, String qName) throws SAXException { 100 path.pop(); 101 if (path.isEmpty() || !path.lastElement().startsWith("text:")) { 102 dumpText = false; 103 } 104 105 // Specific separators for spreadsheets: 106 if (isSpreadSheet) { 107 108 // End of table row: add a blank line 109 if (qName.equals("table:table-row")) { 110 sb.append("\n\n"); 111 } 112 113 // End of table cell: add a separator 114 if (qName.equals("table:table-cell")) { 115 sb.append(" "); 116 } 117 118 // End of paragraph: add a white space 119 if (qName.equals("text:p")) { 120 sb.append(" "); 121 } 122 } 123 } 124 125}