/*
 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
017package org.nuxeo.importer.stream.producer;
018
019import java.io.Serializable;
020import java.util.ArrayList;
021import java.util.Collections;
022import java.util.HashMap;
023import java.util.HashSet;
024import java.util.List;
025import java.util.Random;
026import java.util.Set;
027import java.util.concurrent.ThreadLocalRandom;
028
029import org.apache.commons.logging.Log;
030import org.apache.commons.logging.LogFactory;
031import org.nuxeo.ecm.core.api.Blob;
032import org.nuxeo.ecm.core.api.Blobs;
033import org.nuxeo.ecm.platform.importer.random.HunspellDictionaryHolder;
034import org.nuxeo.ecm.platform.importer.random.RandomTextGenerator;
035import org.nuxeo.importer.stream.message.DocumentMessage;
036import org.nuxeo.lib.stream.pattern.producer.AbstractProducer;
037
038/**
039 * @since 9.1
040 */
041public class RandomDocumentMessageProducer extends AbstractProducer<DocumentMessage> {
042    private static final Log log = LogFactory.getLog(RandomDocumentMessageProducer.class);
043
044    protected final long nbDocuments;
045
046    protected final BlobInfoFetcher blobInfoFetcher;
047
048    protected boolean countFolderAsDocument = true;
049
050    protected int maxFoldersPerFolder = 50;
051
052    protected int maxDocumentsPerFolder = 500;
053
054    protected int blobSizeKB = 0;
055
056    protected boolean blobOnlyText = false;
057
058    protected int documentCount = 0;
059
060    protected int folderCount = 0;
061
062    protected final Random rand;
063
064    protected static RandomTextGenerator gen;
065
066    static protected final String[] DC_NATURE = { "article", "acknowledgement", "assessment", "application", "order",
067            "contract", "quotation", "fax", "worksheet", "letter", "memo", "note", "notification", "procedure",
068            "report", "internshipReport", "pressReview" };
069
070    static protected final String[] DC_SUBJECTS = { "art/architecture", "art/comics", "art/cinema", "art/culture",
071            "art/danse", "art/music", "sciences/astronomy", "sciences/biology", "sciences/chemistry", "sciences/math",
072            "sciences/physic", "society/ecology", "daily life/gastronomy", "daily life/gardening", "daily life/sport",
073            "technology/it" };
074
075    static protected final String[] DC_RIGHTS = { "OpenContentL", "CC-BY-NC", "CC-BY-ND", "FreeArt", "ODbi", "GNUGPL",
076            "FreeBSD", "CC0" };
077
078    static protected final String[] DC_LANGUAGE = { "IT", "DE", "FR", "US", "EN" };
079
080    static protected final String[] DC_SOURCE = { "internal", "external", "unknown" };
081
082    static protected final String[] DC_COVERAGE = { "europe/France", "europe/Germany", "europe/Italy", "europe/Spain",
083            "oceania/Tonga", "africa/Mali", "asia/Japan", "north-america/United_States_of_America" };
084
085    protected int foldersInCurrentFolderLimit;
086
087    protected int documentInCurrentFolderLimit;
088
089    protected enum DocType {
090        Root, Folder, Document
091    }
092
093    protected DocType currentType = DocType.Root;
094
095    protected int parentIndex = 0;
096
097    protected List<String> parents = new ArrayList<>();
098
099    protected List<String> folderishChildren = new ArrayList<>();
100
101    protected Set<String> children = new HashSet<>();
102
103    protected int documentInCurrentFolderCount = 0;
104
105    public RandomDocumentMessageProducer(int producerId, long nbDocuments, String lang,
106            BlobInfoFetcher blobInfoFetcher) {
107        super(producerId);
108        this.nbDocuments = nbDocuments;
109        rand = ThreadLocalRandom.current();
110
111        synchronized (RandomDocumentMessageProducer.class) {
112            if (gen == null) {
113                gen = new RandomTextGenerator(new HunspellDictionaryHolder(lang));
114                gen.prefilCache();
115            }
116        }
117        this.blobInfoFetcher = blobInfoFetcher;
118        log.info("RandomDocumentMessageProducer created, nbDocuments: " + nbDocuments);
119    }
120
121    public RandomDocumentMessageProducer setMaxFoldersPerFolder(int max) {
122        maxFoldersPerFolder = max;
123        return this;
124    }
125
126    public RandomDocumentMessageProducer setMaxDocumentsPerFolder(int max) {
127        maxDocumentsPerFolder = max;
128        return this;
129    }
130
131    public RandomDocumentMessageProducer countFolderAsDocument(boolean value) {
132        countFolderAsDocument = value;
133        return this;
134    }
135
136    public RandomDocumentMessageProducer withBlob(int sizeKB, boolean onlyText) {
137        this.blobSizeKB = sizeKB;
138        this.blobOnlyText = onlyText;
139        return this;
140    }
141
142    @Override
143    public int getPartition(DocumentMessage message, int partitions) {
144        return getProducerId() % partitions;
145    }
146
147    @Override
148    public boolean hasNext() {
149        if (countFolderAsDocument) {
150            return (documentCount + folderCount) < nbDocuments;
151        }
152        return documentCount <= nbDocuments;
153    }
154
155    @Override
156    public DocumentMessage next() {
157        DocumentMessage ret;
158        switch (currentType) {
159        case Root:
160            ret = createRoot();
161            parents.add(ret.getId());
162            currentType = DocType.Folder;
163            foldersInCurrentFolderLimit = rand.nextInt(maxFoldersPerFolder) + 1;
164            break;
165        case Folder:
166            ret = createFolder(parents.get(parentIndex), children);
167            folderishChildren.add(ret.getId());
168            children.add(ret.getName());
169            if (folderishChildren.size() >= foldersInCurrentFolderLimit) {
170                currentType = DocType.Document;
171                documentInCurrentFolderCount = 0;
172                documentInCurrentFolderLimit = rand.nextInt(maxDocumentsPerFolder);
173            }
174            break;
175        default:
176        case Document:
177            ret = createDocument(parents.get(parentIndex), children);
178            children.add(ret.getName());
179            documentInCurrentFolderCount += 1;
180            if (documentInCurrentFolderCount > documentInCurrentFolderLimit) {
181                parentIndex += 1;
182                if (parentIndex >= parents.size()) {
183                    parents.clear();
184                    parents = folderishChildren;
185                    folderishChildren = new ArrayList<>();
186                    children = new HashSet<>();
187                    parentIndex = 0;
188                }
189                currentType = DocType.Folder;
190                foldersInCurrentFolderLimit = rand.nextInt(maxFoldersPerFolder) + 1;
191            }
192            break;
193        }
194        // log.debug(ret.getType() + ": " + ret.getId());
195        return ret;
196    }
197
198    protected DocumentMessage createRoot() {
199        folderCount++;
200        return getRandomNodeWithPrefix(String.format("%02d-", getProducerId()), "Folder", "");
201    }
202
203    protected DocumentMessage createFolder(String parentPath, Set<String> exclude) {
204        DocumentMessage node = getRandomNodeWithExclusion("Folder", parentPath, false, exclude);
205        folderCount++;
206        return node;
207    }
208
209    protected DocumentMessage createDocument(String parentPath, Set<String> exclude) {
210        DocumentMessage node = getRandomNodeWithExclusion("File", parentPath, true, exclude);
211        documentCount++;
212        return node;
213    }
214
215    protected DocumentMessage getRandomNodeWithExclusion(String type, String parentPath, boolean withBlob,
216            Set<String> exclude) {
217        DocumentMessage node = getRandomNode(type, parentPath, withBlob);
218        String name = node.getName();
219        if (exclude.contains(name)) {
220            String newName = name + "-" + rand.nextInt(exclude.size());
221            node = DocumentMessage.copy(node, newName);
222        }
223        return node;
224    }
225
226    protected DocumentMessage getRandomNode(String type, String parentPath, boolean withBlob) {
227        String title = getTitle();
228        String name = getName(title);
229        HashMap<String, Serializable> props = getRandomProperties(title);
230        DocumentMessage.Builder builder = DocumentMessage.builder(type, parentPath, name).setProperties(props);
231        if (withBlob) {
232            if (blobInfoFetcher != null) {
233                builder.setBlobInfo(blobInfoFetcher.get(builder));
234            } else {
235                builder.setBlob(getRandomBlob());
236            }
237        }
238        return builder.build();
239    }
240
241    protected DocumentMessage getRandomNodeWithPrefix(String prefix, String type, String parentPath) {
242        String title = getTitle();
243        String name = prefix + getName(title);
244        HashMap<String, Serializable> props = getRandomProperties(title);
245        DocumentMessage.Builder builder = DocumentMessage.builder(type, parentPath, name).setProperties(props);
246        if (blobInfoFetcher != null) {
247            builder.setBlobInfo(blobInfoFetcher.get(builder));
248        } else {
249            builder.setBlob(getRandomBlob());
250        }
251        return builder.build();
252    }
253
254    protected Blob getRandomBlob() {
255        if (blobSizeKB == 0) {
256            return null;
257        }
258        String content = gen.getRandomText(blobSizeKB);
259        return Blobs.createBlob(content, getBlobMimeType(), null, getName(getTitle()) + ".txt");
260    }
261
262    protected String getBlobMimeType() {
263        if (blobOnlyText) {
264            return "text/plain";
265        } else {
266            return "text/partial";
267        }
268    }
269
270    protected String getName(String title) {
271        return title.replaceAll("\\W+", "-").toLowerCase();
272    }
273
274    protected String getTitle() {
275        return capitalize(gen.getRandomTitle(rand.nextInt(3) + 1).trim());
276        // return "f" + folderCount;
277    }
278
279    protected String capitalize(final String line) {
280        return Character.toUpperCase(line.charAt(0)) + line.substring(1);
281    }
282
283    protected HashMap<String, Serializable> getRandomProperties(String title) {
284        HashMap<String, Serializable> ret = new HashMap<>();
285        ret.put("dc:title", title);
286        if (rand.nextInt(10) == 1) {
287            String description = gen.getRandomTitle(rand.nextInt(5) + 1);
288            ret.put("dc:description", capitalize(description));
289        }
290        ret.put("dc:nature", getGaussian(DC_NATURE));
291        ret.put("dc:subjects", (Serializable) Collections.singletonList(getGaussian(DC_SUBJECTS)));
292        ret.put("dc:rights", getGaussian(DC_RIGHTS));
293        ret.put("dc:language", getGaussian(DC_LANGUAGE));
294        ret.put("dc:coverage", getGaussian(DC_COVERAGE));
295        ret.put("dc:source", getGaussian(DC_SOURCE));
296        return ret;
297    }
298
299    protected String getGaussian(String[] words) {
300        double g = Math.abs(rand.nextGaussian() / 4);
301        g = Math.min(g, 1);
302        int i = (int) Math.floor(g * (words.length - 1));
303        return words[i];
304    }
305
306    @Override
307    public void close() throws Exception {
308        super.close();
309        if (blobInfoFetcher != null) {
310            blobInfoFetcher.close();
311        }
312    }
313
314}