/*
 * (C) Copyright 2016-2020 Nuxeo (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Florent Guillaume
 */
package org.nuxeo.ecm.directory;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.BiConsumer;
import java.util.function.Consumer;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.schema.types.Field;
import org.nuxeo.ecm.core.schema.types.Schema;
import org.nuxeo.ecm.core.schema.types.Type;
import org.nuxeo.ecm.core.schema.types.primitives.DateType;
import org.nuxeo.runtime.api.Framework;

/**
 * Helper to load data from a CSV file.
 * <p>
 * The actual consumer of rows is a parameter passed by the caller.
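 * <p>
 * A minimal usage sketch (the schema name {@code "myschema"}, the data file name and the way each row is
 * handled are purely illustrative):
 *
 * <pre>{@code
 * Schema schema = Framework.getService(SchemaManager.class).getSchema("myschema");
 * DirectoryCSVLoader.loadData("directories/myvocabulary.csv", ',', schema,
 *         row -> log.debug("loaded row: {}", row));
 * }</pre>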
 *
 * @since 8.4
 */
public class DirectoryCSVLoader {

    private static final Logger log = LogManager.getLogger(DirectoryCSVLoader.class);

    /**
     * The special CSV value ({@value}) used to denote that {@code null} should be used as the value.
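     * <p>
     * For example, with a comma delimiter and illustrative {@code id} and {@code parent} columns, the following data
     * line yields an entry whose {@code parent} value is {@code null}:
     *
     * <pre>
     * id,parent
     * root,__NULL__
     * </pre>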
     */
    public static final String CSV_NULL_MARKER = "__NULL__";

    private DirectoryCSVLoader() {
    }

    /**
     * Loads the CSV data file based on the provided schema, and creates the corresponding entries using the provided
     * loader.
     *
     * @param dataFileName the name of the file containing the CSV data
     * @param delimiter the CSV column separator
     * @param schema the data schema
     * @param loader the actual consumer of loaded rows
     * @since 8.4
     * @see #loadData(Blob, char, Schema, Consumer)
     */
    public static void loadData(String dataFileName, char delimiter, Schema schema,
            Consumer<Map<String, Object>> loader) {
        Blob blob = createBlob(dataFileName);
        loadData(blob, delimiter, schema, loader);
    }

    /**
     * Creates a {@code text/csv} blob from the named resource data file.
     */
    protected static Blob createBlob(String dataFileName) {
        try (InputStream in = getResource(dataFileName)) {
            Blob blob = Blobs.createBlob(in, "text/csv");
            // keep the original file name so that log and error messages can reference it
            blob.setFilename(dataFileName);
            return blob;
        } catch (IOException e) {
            throw new DirectoryException("Read error while creating blob from data file: " + dataFileName, e);
        }
    }

    /**
     * Loads the CSV data file based on the provided schema, and creates the corresponding entries using the provided
     * loader.
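     * <p>
     * A minimal sketch (the CSV content and the {@code session} consumer are illustrative; any blob holding CSV data
     * can be used):
     *
     * <pre>{@code
     * Blob csv = Blobs.createBlob("id,label\nfirst,First\nsecond,__NULL__", "text/csv");
     * DirectoryCSVLoader.loadData(csv, ',', schema, row -> session.createEntry(row));
     * }</pre>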
     *
     * @param dataBlob the blob containing CSV data
     * @param delimiter the CSV column separator
     * @param schema the data schema
     * @param loader the actual consumer of loaded rows
     * @since 11.1
     */
    public static void loadData(Blob dataBlob, char delimiter, Schema schema, Consumer<Map<String, Object>> loader) {
        BiConsumer<Map<String, Object>, Integer> loaderWithLineno = toLoaderEnrichedOnError(loader);
        String dataFileName = dataBlob.getFilename();
        try (InputStream in = dataBlob.getStream();
                CSVParser csvParser = new CSVParser(new InputStreamReader(in, UTF_8),
                        CSVFormat.DEFAULT.withDelimiter(delimiter).withHeader())) {

            Map<String, Integer> header = csvParser.getHeaderMap();

            List<Field> fields = new ArrayList<>();
            for (String columnName : header.keySet()) {
                Field field = schema.getField(columnName.trim());
                if (field == null) {
                    throw new DirectoryException("Column not found: " + columnName + " in schema: " + schema.getName());
                }
                fields.add(field);
            }

            int lineno = 1; // header was first line
            for (CSVRecord record : csvParser) {
                lineno++;
                if (record.size() == 0 || record.size() == 1 && StringUtils.isBlank(record.get(0))) {
                    // NXP-2538: allow columns with only one value but skip empty lines
                    continue;
                }
                if (!record.isConsistent()) {
                    log.error("Invalid column count while reading CSV file: {}, line: {}", dataFileName, lineno);
                    continue;
                }

                Map<String, Object> map = new HashMap<>();
                for (int i = 0; i < header.size(); i++) {
                    Field field = fields.get(i);
                    String value = record.get(i);
                    Object v = CSV_NULL_MARKER.equals(value) ? null : decode(field, value);
                    map.put(field.getName().getPrefixedName(), v);
                }
                loaderWithLineno.accept(map, lineno);
            }
        } catch (IOException e) {
            throw new DirectoryException("Read error while reading data file: " + dataFileName, e);
        }
    }

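    /**
     * Wraps the given loader so that any {@link DirectoryException} thrown while consuming a row is enriched with the
     * CSV line number before being rethrown.
     */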
    protected static BiConsumer<Map<String, Object>, Integer> toLoaderEnrichedOnError(
            Consumer<Map<String, Object>> loader) {
        return (map, lineno) -> {
            try {
                loader.accept(map);
            } catch (DirectoryException e) {
                e.addInfo("At line: " + lineno);
                throw e;
            }
        };
    }

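    /**
     * Decodes a CSV string value into the Java object expected for the given field type. Dates are parsed as
     * {@link Timestamp} literals and interpreted in the local timezone; other types are decoded by the schema type
     * itself.
     */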
    protected static Object decode(Field field, String value) {
        Type type = field.getType();
        if (type instanceof DateType) {
            // compat with earlier code, interpret in the local timezone and not UTC
            Calendar cal = new GregorianCalendar();
            cal.setTime(Timestamp.valueOf(value));
            return cal;
        } else {
            return type.decode(value);
        }
    }

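    /**
     * Opens the named resource, looking it up first through this class's classloader and then through the runtime
     * resource loader.
     *
     * @throws DirectoryException if the resource cannot be found
     */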
    protected static InputStream getResource(String name) {
        InputStream in = open(DirectoryCSVLoader.class.getClassLoader().getResource(name));
        if (in == null) {
            in = open(Framework.getResourceLoader().getResource(name));
            if (in == null) {
                throw new DirectoryException("Data file not found: " + name);
            }
        }
        return in;
    }

    /**
     * Gets the {@link InputStream} from a {@link URL}, avoiding JAR caches.
     *
     * @since 11.1
     */
    protected static InputStream open(URL url) {
        if (url == null) {
            return null;
        }
        URLConnection con;
        try {
            con = url.openConnection();
        } catch (IOException e) {
            return null;
        }
        // avoid using caches, as hot-reload may change underlying JARs
        con.setUseCaches(false);
        try {
            return con.getInputStream();
        } catch (IOException e) {
            return null;
        }
    }

}