001/*
002 * (C) Copyright 2006-2008 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Nuxeo - initial API and implementation
018 *
019 * $Id$
020 */
021
022package org.nuxeo.ecm.platform.importer.source;
023
024import java.io.Serializable;
025import java.util.ArrayList;
026import java.util.Arrays;
027import java.util.HashMap;
028import java.util.List;
029import java.util.Map;
030import java.util.Random;
031import java.util.concurrent.atomic.AtomicInteger;
032import java.util.concurrent.atomic.AtomicLong;
033
034import org.apache.commons.logging.Log;
035import org.apache.commons.logging.LogFactory;
036import org.nuxeo.ecm.core.api.Blob;
037import org.nuxeo.ecm.core.api.Blobs;
038import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
039import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
040import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolderWithProperties;
041import org.nuxeo.ecm.platform.importer.random.DictionaryHolder;
042import org.nuxeo.ecm.platform.importer.random.HunspellDictionaryHolder;
043import org.nuxeo.ecm.platform.importer.random.RandomTextGenerator;
044
045
046/**
047 * Random {@link SourceNode} to be used for load testing
048 *
049 * @author Thierry Delprat
050 */
051public class RandomTextSourceNode implements SourceNode {
052
053    private static final Log log = LogFactory.getLog(RandomTextSourceNode.class);
054
055    protected static RandomTextGenerator gen;
056
057    protected static int maxNode = 10000;
058
059    /**
060     * Used in {@link #getMaxChildren()} and {@link #getMaxFolderish()}.
061     */
062    protected static boolean nonUniformRepartition = false;
063
064    public static final int MAX_DEPTH = 8;
065
066    public static final int DEFAULT_NB_DATA_NODES_PER_FOLDER = 100;
067
068    /**
069     * Used to generate a big number of children nodes when {@link #nonUniformRepartition} is {@code true}.
070     */
071    public static final int BIG_NB_NODES_FACTOR = 50;
072
073    /**
074     * Used to generate a small number of children nodes when {@link #nonUniformRepartition} is {@code true}.
075     */
076    public static final int SMALL_NB_BODES_DIVIDER = DEFAULT_NB_DATA_NODES_PER_FOLDER;
077
078    protected static int minGlobalFolders = 0;
079
080    protected static int minFoldersPerNode = 0;
081
082    protected static AtomicInteger nbNodes;
083
084    protected static AtomicInteger nbFolders;
085
086    protected static AtomicInteger nbVisitedFolders;
087
088    protected static AtomicLong size;
089
090    protected static final Random RANDOM = new Random(); // NOSONAR (doesn't need cryptographic strength)
091
092    protected String name;
093
094    protected boolean folderish;
095
096    protected int level = 0;
097
098    protected int idx = 0;
099
100    protected static Integer blobSizeInKB;
101
102    protected List<SourceNode> cachedChildren = null;
103
104    protected static final boolean CACHE_CHILDREN = false;
105
106    protected boolean onlyText = true;
107
108    protected boolean withProperties = false;
109
110    protected static final String[] DC_NATURE = { "article", "acknowledgement", "assessment", "application", "order",
111            "contract", "quotation", "fax", "worksheet", "letter", "memo", "note", "notification", "procedure",
112            "report", "internshipReport", "pressReview" };
113
114    protected static final String[] DC_SUBJECTS = { "art/architecture", "art/comics", "art/cinema", "art/culture",
115            "art/danse", "art/music", "sciences/astronomy", "sciences/biology", "sciences/chemistry", "sciences/math",
116            "sciences/physic", "society/ecology", "daily life/gastronomy", "daily life/gardening", "daily life/sport",
117            "technology/it" };
118
119    protected static final String[] DC_RIGHTS = { "OpenContentL", "CC-BY-NC", "CC-BY-ND", "FreeArt", "ODbi", "GNUGPL",
120            "FreeBSD", "CC0" };
121
122    protected static final String[] DC_LANGUAGE = { "IT", "DE", "FR", "US", "EN" };
123
124    protected static final String[] DC_SOURCE = { "internal", "external", "unknown" };
125
126    protected static final String[] DC_COVERAGE = { "europe/France", "europe/Germany", "europe/Italy", "europe/Spain",
127            "oceania/Tonga", "africa/Mali", "asia/Japan", "north-america/United_States_of_America" };
128
129    public RandomTextSourceNode(boolean folderish, int level, int idx, boolean onlyText, boolean withProperties) {
130        this.folderish = folderish;
131        this.level = level;
132        this.idx = idx;
133        this.onlyText = onlyText;
134        this.withProperties = withProperties;
135    }
136
137    public RandomTextSourceNode(boolean folderish, int level, int idx, boolean onlyText) {
138        this(folderish, level, idx, onlyText, false);
139    }
140
141    public static RandomTextSourceNode init(int maxSize) {
142        return init(maxSize, null, true);
143    }
144
145    public static RandomTextSourceNode init(int maxSize, Integer blobSizeInKB, boolean onlyText) {
146        return init(maxSize, blobSizeInKB, onlyText, false, false, null);
147    }
148
149    public static RandomTextSourceNode init(int maxSize, Integer blobSizeInKB, boolean onlyText, boolean nonUniform,
150                                            boolean withProperties, String lang) {
151        return init(maxSize, blobSizeInKB, onlyText, new HunspellDictionaryHolder(lang), nonUniform,
152                withProperties);
153    }
154
155    public static RandomTextSourceNode init(int maxSize, Integer blobSizeInKB, boolean onlyText,
156            DictionaryHolder dictionaryHolder, boolean nonUniform, boolean withProperties) {
157        gen = new RandomTextGenerator(dictionaryHolder);
158        gen.prefilCache();
159        maxNode = maxSize;
160        nbNodes = new AtomicInteger(0);
161        nbFolders = new AtomicInteger(1);
162        nbVisitedFolders = new AtomicInteger(0);
163        size = new AtomicLong(0);
164        RandomTextSourceNode.blobSizeInKB = blobSizeInKB;
165        minGlobalFolders = maxNode / DEFAULT_NB_DATA_NODES_PER_FOLDER;
166        minFoldersPerNode = 1 + (int) Math.pow(minGlobalFolders, (1.0 / MAX_DEPTH));
167        nonUniformRepartition = nonUniform;
168        return new RandomTextSourceNode(true, 0, 0, onlyText, withProperties);
169    }
170
171    protected String getBlobMimeType() {
172        if (onlyText) {
173            return "text/plain";
174        } else {
175            return "text/partial";
176        }
177    }
178
179    private String capitalize(final String line) {
180        return Character.toUpperCase(line.charAt(0)) + line.substring(1);
181    }
182
183    @Override
184    public BlobHolder getBlobHolder() {
185        String content = null;
186        if (folderish) {
187            if (withProperties) {
188                return new SimpleBlobHolderWithProperties((Blob) null, getRandomProperties(content));
189            }
190            return null;
191        }
192        if (blobSizeInKB == null) {
193            content = gen.getRandomText();
194        } else {
195            content = gen.getRandomText(blobSizeInKB);
196        }
197        size.addAndGet(content.length());
198        Blob blob = Blobs.createBlob(content, getBlobMimeType(), null, getName() + ".txt");
199        if (withProperties) {
200            return new SimpleBlobHolderWithProperties(blob, getRandomProperties(content));
201        }
202        return new SimpleBlobHolder(blob);
203    }
204
205    protected Map<String, Serializable> getRandomProperties(String content) {
206        Map<String, Serializable> ret = new HashMap<>();
207        ret.put("dc:title", capitalize(getName()));
208        if (RANDOM.nextInt(10) == 1) {
209            String description;
210            if (content != null && ! content.isEmpty()) {
211                description = content.substring(0, content.indexOf(' ', 40));
212            } else {
213                description = gen.getRandomTitle(RANDOM.nextInt(5)+1);
214            }
215            ret.put("dc:description", capitalize(description));
216        }
217        ret.put("dc:nature", getGaussian(DC_NATURE));
218        ret.put("dc:subjects", (Serializable) Arrays.asList(getGaussian(DC_SUBJECTS)));
219        ret.put("dc:rights", getGaussian(DC_RIGHTS));
220        ret.put("dc:language", getGaussian(DC_LANGUAGE));
221        ret.put("dc:coverage", getGaussian(DC_COVERAGE));
222        ret.put("dc:source", getGaussian(DC_SOURCE));
223        // validation contraint violation
224        // ret.put("dc:creator", String.format("user%03d", hazard.nextInt(500)));
225        return ret;
226    }
227
228    protected String getGaussian(String[] words) {
229        double g = Math.abs(RANDOM.nextGaussian() / 4);
230        g = Math.min(g, 1);
231        int i = (int) Math.floor(g * (words.length - 1));
232        return words[ i ];
233    }
234
235    protected int getMidRandom(int target) {
236        return 1 + (target / 2) + RANDOM.nextInt(target);
237    }
238
239    /**
240     * Allows to get a non uniform distribution of the number of nodes per folder. Returns:
241     * <ul>
242     * <li>A small number of nodes 10% of the time, see {@link #SMALL_NB_BODES_DIVIDER}.</li>
243     * <li>A big number of nodes 10% of the time, see {@link #BIG_NB_NODES_FACTOR}.</li>
244     * <li>A random variation of the target number of nodes 80% of the time.</li>
245     * </ul>
246     */
247    protected int getNonUniform(int target, boolean folderish) {
248        int res;
249        int remainder = nbVisitedFolders.get() % 10;
250        if (remainder == 8) {
251            res = 1 + target / SMALL_NB_BODES_DIVIDER;
252            if (log.isDebugEnabled()) {
253                String nodeStr;
254                if (folderish) {
255                    nodeStr = "folderish";
256                } else {
257                    nodeStr = "data";
258                }
259                log.debug(String.format("### Small number of %s nodes: %d", nodeStr, res));
260            }
261        } else if (remainder == 9) {
262            int factor;
263            // Big number of folderish nodes is 10 times smaller than the big number of data nodes
264            if (folderish) {
265                factor = BIG_NB_NODES_FACTOR / 10;
266            } else {
267                factor = BIG_NB_NODES_FACTOR;
268            }
269            res = 1 + target * factor;
270            if (log.isDebugEnabled()) {
271                String nodeStr;
272                if (folderish) {
273                    nodeStr = "folderish";
274                } else {
275                    nodeStr = "data";
276                }
277                log.debug(String.format("### Big number of %s nodes: %d", nodeStr, res));
278            }
279        } else {
280            res = getMidRandom(target);
281        }
282        return res;
283    }
284
285    protected int getMaxChildren() {
286        if (maxNode < nbNodes.get()) {
287            return 0;
288        }
289        int targetRemainingFolders = minGlobalFolders - nbFolders.get();
290        if (targetRemainingFolders <= 0) {
291            return DEFAULT_NB_DATA_NODES_PER_FOLDER + 1;
292        }
293        int target = ((maxNode - nbNodes.get()) / targetRemainingFolders);
294        if (target <= 0) {
295            return 0;
296        }
297        if (nonUniformRepartition) {
298            return getNonUniform(target, false);
299        } else {
300            return getMidRandom(target);
301        }
302    }
303
304    protected int getMaxFolderish() {
305        if (maxNode <= nbNodes.get()) {
306            return 0;
307        }
308        if (nonUniformRepartition) {
309            return getNonUniform(minFoldersPerNode, true);
310        } else {
311            return getMidRandom(minFoldersPerNode);
312        }
313    }
314
315    @Override
316    public List<SourceNode> getChildren() {
317
318        if (!folderish) {
319            return null;
320        }
321
322        if (cachedChildren != null) {
323            return cachedChildren;
324        }
325
326        List<SourceNode> children = new ArrayList<>();
327        if (nbNodes.get() > maxNode) {
328            return children;
329        }
330
331        int nbChildren = getMaxChildren();
332        for (int i = 0; i < nbChildren; i++) {
333            children.add(new RandomTextSourceNode(false, level, i, onlyText, withProperties));
334        }
335        nbNodes.addAndGet(nbChildren);
336        if (log.isDebugEnabled()) {
337            String nodeStr;
338            if (nbChildren > 1) {
339                nodeStr = "nodes";
340            } else {
341                nodeStr = "node";
342            }
343            log.debug(String.format("Added %s data %s to %s; data node total count = %s", nbChildren, nodeStr,
344                    getName(), nbNodes));
345        }
346
347        if (level < MAX_DEPTH) {
348            // In the case of a non uniform repartition, don't add folderish nodes if there are no data nodes to not
349            // overload the tree with folderish nodes that would probably be empty
350            if (!nonUniformRepartition || nbChildren > 0) {
351                int nbFolderish = getMaxFolderish();
352                for (int i = 0; i < nbFolderish; i++) {
353                    children.add(new RandomTextSourceNode(true, level + 1, i, onlyText, withProperties));
354                }
355                nbFolders.addAndGet(nbFolderish);
356                if (log.isDebugEnabled()) {
357                    String nodeStr;
358                    if (nbFolderish > 1) {
359                        nodeStr = "nodes";
360                    } else {
361                        nodeStr = "node";
362                    }
363                    log.debug(String.format("Added %s folderish %s to %s; folderish node total count = %s",
364                            nbFolderish, nodeStr, getName(), nbFolders));
365                }
366            }
367        }
368        if (CACHE_CHILDREN) {
369            cachedChildren = children;
370        }
371
372        nbVisitedFolders.incrementAndGet();
373        if (log.isDebugEnabled()) {
374            String folderStr;
375            if (nbVisitedFolders.get() > 1) {
376                folderStr = "folders";
377            } else {
378                folderStr = "folder";
379            }
380            log.debug(String.format("Visited %s %s", nbVisitedFolders, folderStr));
381        }
382
383        return children;
384    }
385
386    @Override
387    public String getName() {
388        if (name == null) {
389            if (withProperties) {
390                name = gen.getRandomTitle(RANDOM.nextInt(3)+1);
391            }
392            else {
393                if (folderish) {
394                    name = "folder";
395                } else {
396                    name = "file";
397                }
398                if (level == 0 && folderish) {
399                    name = name + "-" + (System.currentTimeMillis() % 10000) + RANDOM.nextInt(100);
400                } else {
401                    name = name + "-" + level + "-" + idx;
402                }
403            }
404        }
405        return name;
406    }
407
408    @Override
409    public boolean isFolderish() {
410        return folderish;
411    }
412
413    public static Integer getNbNodes() {
414        return nbNodes.get();
415    }
416
417    public static Long getSize() {
418        return size.get();
419    }
420
421    public int getLevel() {
422        return level;
423    }
424
425    @Override
426    public String getSourcePath() {
427        return null;
428    }
429}