001/* 002 * (C) Copyright 2006-2012 Nuxeo SAS (http://nuxeo.com/) and contributors. 003 * 004 * All rights reserved. This program and the accompanying materials 005 * are made available under the terms of the GNU Lesser General Public License 006 * (LGPL) version 2.1 which accompanies this distribution, and is available at 007 * http://www.gnu.org/licenses/lgpl.html 008 * 009 * This library is distributed in the hope that it will be useful, 010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 012 * Lesser General Public License for more details. 013 * 014 * Contributors: 015 * Nuxeo 016 * Antoine Taillefer 017 * 018 */ 019 020package org.nuxeo.ecm.core.convert.plugins.text.extractors; 021 022import java.io.ByteArrayInputStream; 023import java.io.IOException; 024import java.util.Iterator; 025import java.util.Set; 026import java.util.TreeSet; 027import java.util.zip.ZipEntry; 028import java.util.zip.ZipInputStream; 029 030import org.apache.commons.io.IOUtils; 031import org.apache.commons.logging.Log; 032import org.apache.commons.logging.LogFactory; 033import org.nuxeo.ecm.core.convert.plugins.text.extractors.presentation.PresentationSlide; 034import org.xml.sax.InputSource; 035import org.xml.sax.SAXException; 036import org.xml.sax.XMLReader; 037 038/** 039 * Pptx to text converter: parses the Open XML presentation document to read its content. 040 */ 041public class PPTX2TextConverter extends XmlZip2TextConverter { 042 043 protected static final Log log = LogFactory.getLog(PPTX2TextConverter.class); 044 045 private static final String PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX = "ppt/slides/slide"; 046 047 protected void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb) throws IOException, 048 SAXException { 049 050 Set<PresentationSlide> slides = new TreeSet<PresentationSlide>(); 051 052 ZipEntry zipEntry = zis.getNextEntry(); 053 while (zipEntry != null) { 054 String zipEntryName = zipEntry.getName(); 055 if (zipEntryName.startsWith(PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX) 056 && zipEntryName.length() > PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX.length()) { 057 char slideNumberChar = zipEntryName.charAt(PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX.length()); 058 int slideNumber = -1; 059 try { 060 slideNumber = Integer.parseInt(String.valueOf(slideNumberChar)); 061 } catch (NumberFormatException nfe) { 062 log.warn("Slide number is not an non integer, won't take this slide into account."); 063 } 064 if (slideNumber > -1) { 065 OpenXmlContentHandler contentHandler = new OpenXmlContentHandler(); 066 reader.setContentHandler(contentHandler); 067 reader.parse(new InputSource(new ByteArrayInputStream(IOUtils.toByteArray(zis)))); 068 slides.add(new PresentationSlide(contentHandler.getContent(), slideNumber)); 069 } 070 } 071 zipEntry = zis.getNextEntry(); 072 } 073 if (!slides.isEmpty()) { 074 Iterator<PresentationSlide> slidesIt = slides.iterator(); 075 while (slidesIt.hasNext()) { 076 PresentationSlide slide = slidesIt.next(); 077 sb.append(slide.getContent()); 078 sb.append("\n"); 079 } 080 } 081 } 082}