001/* 002 * (C) Copyright 2006-2012 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Nuxeo 018 * Antoine Taillefer 019 * 020 */ 021 022package org.nuxeo.ecm.core.convert.plugins.text.extractors; 023 024import java.io.ByteArrayInputStream; 025import java.io.IOException; 026import java.util.Iterator; 027import java.util.Set; 028import java.util.TreeSet; 029import java.util.zip.ZipEntry; 030import java.util.zip.ZipInputStream; 031 032import org.apache.commons.io.IOUtils; 033import org.apache.commons.logging.Log; 034import org.apache.commons.logging.LogFactory; 035import org.nuxeo.ecm.core.convert.plugins.text.extractors.presentation.PresentationSlide; 036import org.xml.sax.InputSource; 037import org.xml.sax.SAXException; 038import org.xml.sax.XMLReader; 039 040/** 041 * Pptx to text converter: parses the Open XML presentation document to read its content. 042 */ 043public class PPTX2TextConverter extends XmlZip2TextConverter { 044 045 protected static final Log log = LogFactory.getLog(PPTX2TextConverter.class); 046 047 private static final String PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX = "ppt/slides/slide"; 048 049 protected void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb) throws IOException, 050 SAXException { 051 052 Set<PresentationSlide> slides = new TreeSet<PresentationSlide>(); 053 054 ZipEntry zipEntry = zis.getNextEntry(); 055 while (zipEntry != null) { 056 String zipEntryName = zipEntry.getName(); 057 if (zipEntryName.startsWith(PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX) 058 && zipEntryName.length() > PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX.length()) { 059 char slideNumberChar = zipEntryName.charAt(PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX.length()); 060 int slideNumber = -1; 061 try { 062 slideNumber = Integer.parseInt(String.valueOf(slideNumberChar)); 063 } catch (NumberFormatException nfe) { 064 log.warn("Slide number is not an non integer, won't take this slide into account."); 065 } 066 if (slideNumber > -1) { 067 OpenXmlContentHandler contentHandler = new OpenXmlContentHandler(); 068 reader.setContentHandler(contentHandler); 069 reader.parse(new InputSource(new ByteArrayInputStream(IOUtils.toByteArray(zis)))); 070 slides.add(new PresentationSlide(contentHandler.getContent(), slideNumber)); 071 } 072 } 073 zipEntry = zis.getNextEntry(); 074 } 075 if (!slides.isEmpty()) { 076 Iterator<PresentationSlide> slidesIt = slides.iterator(); 077 while (slidesIt.hasNext()) { 078 PresentationSlide slide = slidesIt.next(); 079 sb.append(slide.getContent()); 080 sb.append("\n"); 081 } 082 } 083 } 084}