/*
 * (C) Copyright 2006-2016 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Nuxeo - initial API and implementation
 *
 */
package org.nuxeo.ecm.core.convert.plugins.text.extractors;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.util.Map;

import org.apache.commons.io.IOUtils;
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.xmlbeans.XmlException;

import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder;
import org.nuxeo.ecm.core.convert.extension.Converter;
import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;
import org.nuxeo.runtime.api.Framework;

/**
 * Converter that extracts the text of a blob through the Apache POI {@link ExtractorFactory} and returns it as a
 * {@code text/plain} blob encoded in UTF-8.
 */
public class MSOffice2TextConverter implements Converter {

    /**
     * Extracts the text of the blob held by {@code blobHolder}.
     * <p>
     * Windows line endings ({@code \r\n}) in the extracted text are replaced by Unix ones ({@code \n}). The text is
     * staged in a temporary file that is deleted before this method returns.
     *
     * @param blobHolder holder of the blob to extract text from
     * @param parameters conversion parameters; not used by this converter
     * @return a {@link SimpleCachableBlobHolder} wrapping the extracted text
     * @throws ConversionException if reading the source, extracting the text or writing the result fails with an
     *             {@link IOException}, {@link OpenXML4JException} or {@link XmlException}
     */
    @Override
    public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {

        File f = null;

        // The source stream is opened here, so it is closed here as well (it used to be leaked).
        try (InputStream source = blobHolder.getBlob().getStream()) {
            POITextExtractor extractor = ExtractorFactory.createExtractor(source);
            // NOTE(review): the extractor itself is not closed; depending on the POI version in use it may be
            // Closeable -- confirm and close it if so.
            // TODO: find a way to distinguish headings from paragraphs using
            // WordExtractor#getParagraphText()?

            // Get extracted text with Unix end of line characters
            String extractedText = extractor.getText().replace("\r\n", "\n");

            byte[] bytes = extractedText.getBytes("UTF-8");
            f = Framework.createTempFile("po-msoffice2text", ".txt");

            // Close the writer before the file is read back, so that a failed write or close surfaces as a
            // ConversionException instead of being silently swallowed.
            try (OutputStream fas = new FileOutputStream(f)) {
                fas.write(bytes);
            }

            // Presumably Blobs.createBlob consumes the stream eagerly, since the temporary file is deleted in the
            // finally block below -- verify against Blobs.
            try (InputStream is = new FileInputStream(f)) {
                Blob blob = Blobs.createBlob(is, "text/plain", "UTF-8");
                return new SimpleCachableBlobHolder(blob);
            }
        } catch (IOException | OpenXML4JException | XmlException e) {
            throw new ConversionException("Error during MSOffice2Text conversion", e);
        } finally {
            if (f != null) {
                f.delete();
            }
        }
    }

    /**
     * Does nothing: this converter reads no configuration from its descriptor.
     *
     * @param descriptor the converter descriptor; ignored
     */
    @Override
    public void init(ConverterDescriptor descriptor) {
    }

}