001/*
002 * (C) Copyright 2006-2008 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Nuxeo - initial API and implementation
018 *
019 * $Id$
020 */
021
022package org.nuxeo.ecm.platform.importer.source;
023
024import java.io.Serializable;
025import java.util.ArrayList;
026import java.util.Arrays;
027import java.util.HashMap;
028import java.util.List;
029import java.util.Map;
030import java.util.Random;
031import java.util.concurrent.atomic.AtomicInteger;
032import java.util.concurrent.atomic.AtomicLong;
033
034import org.apache.commons.logging.Log;
035import org.apache.commons.logging.LogFactory;
036import org.nuxeo.ecm.core.api.Blob;
037import org.nuxeo.ecm.core.api.Blobs;
038import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
039import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
040import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolderWithProperties;
041import org.nuxeo.ecm.platform.importer.random.DictionaryHolder;
042import org.nuxeo.ecm.platform.importer.random.HunspellDictionaryHolder;
043import org.nuxeo.ecm.platform.importer.random.RandomTextGenerator;
044
045
046/**
047 * Random {@link SourceNode} to be used for load testing
048 *
049 * @author Thierry Delprat
050 */
051public class RandomTextSourceNode implements SourceNode {
052
053    private static final Log log = LogFactory.getLog(RandomTextSourceNode.class);
054
055    protected static RandomTextGenerator gen;
056
057    protected static int maxNode = 10000;
058
059    /**
060     * Used in {@link #getMaxChildren()} and {@link #getMaxFolderish()}.
061     */
062    protected static boolean nonUniformRepartition = false;
063
064    public static final int MAX_DEPTH = 8;
065
066    public static final int DEFAULT_NB_DATA_NODES_PER_FOLDER = 100;
067
068    /**
069     * Used to generate a big number of children nodes when {@link #nonUniformRepartition} is {@code true}.
070     */
071    public static final int BIG_NB_NODES_FACTOR = 50;
072
073    /**
074     * Used to generate a small number of children nodes when {@link #nonUniformRepartition} is {@code true}.
075     */
076    public static final int SMALL_NB_BODES_DIVIDER = DEFAULT_NB_DATA_NODES_PER_FOLDER;
077
078    protected static int minGlobalFolders = 0;
079
080    protected static int minFoldersPerNode = 0;
081
082    protected static AtomicInteger nbNodes;
083
084    protected static AtomicInteger nbFolders;
085
086    protected static AtomicInteger nbVisitedFolders;
087
088    protected static AtomicLong size;
089
090    protected Random hazard;
091
092    protected String name;
093
094    protected boolean folderish;
095
096    protected int level = 0;
097
098    protected int idx = 0;
099
100    protected static Integer blobSizeInKB;
101
102    protected List<SourceNode> cachedChildren = null;
103
104    protected static final boolean CACHE_CHILDREN = false;
105
106    protected boolean onlyText = true;
107
108    protected boolean withProperties = false;
109
110    protected static final String[] DC_NATURE = { "article", "acknowledgement", "assessment", "application", "order",
111            "contract", "quotation", "fax", "worksheet", "letter", "memo", "note", "notification", "procedure",
112            "report", "internshipReport", "pressReview" };
113
114    protected static final String[] DC_SUBJECTS = { "art/architecture", "art/comics", "art/cinema", "art/culture",
115            "art/danse", "art/music", "sciences/astronomy", "sciences/biology", "sciences/chemistry", "sciences/math",
116            "sciences/physic", "society/ecology", "daily life/gastronomy", "daily life/gardening", "daily life/sport",
117            "technology/it" };
118
119    protected static final String[] DC_RIGHTS = { "OpenContentL", "CC-BY-NC", "CC-BY-ND", "FreeArt", "ODbi", "GNUGPL",
120            "FreeBSD", "CC0" };
121
122    protected static final String[] DC_LANGUAGE = { "IT", "DE", "FR", "US", "EN" };
123
124    protected static final String[] DC_SOURCE = { "internal", "external", "unknown" };
125
126    protected static final String[] DC_COVERAGE = { "europe/France", "europe/Germany", "europe/Italy", "europe/Spain",
127            "oceania/Tonga", "africa/Mali", "asia/Japan", "north-america/United_States_of_America" };
128
129    public RandomTextSourceNode(boolean folderish, int level, int idx, boolean onlyText, boolean withProperties) {
130        this.folderish = folderish;
131        hazard = new Random();
132        this.level = level;
133        this.idx = idx;
134        this.onlyText = onlyText;
135        this.withProperties = withProperties;
136    }
137
138    public RandomTextSourceNode(boolean folderish, int level, int idx, boolean onlyText) {
139        this(folderish, level, idx, onlyText, false);
140    }
141
142    public static RandomTextSourceNode init(int maxSize) {
143        return init(maxSize, null, true);
144    }
145
146    public static RandomTextSourceNode init(int maxSize, Integer blobSizeInKB, boolean onlyText) {
147        return init(maxSize, blobSizeInKB, onlyText, false, false, null);
148    }
149
150    public static RandomTextSourceNode init(int maxSize, Integer blobSizeInKB, boolean onlyText, boolean nonUniform,
151                                            boolean withProperties, String lang) {
152        return init(maxSize, blobSizeInKB, onlyText, new HunspellDictionaryHolder(lang), nonUniform,
153                withProperties);
154    }
155
156    public static RandomTextSourceNode init(int maxSize, Integer blobSizeInKB, boolean onlyText,
157            DictionaryHolder dictionaryHolder, boolean nonUniform, boolean withProperties) {
158        gen = new RandomTextGenerator(dictionaryHolder);
159        gen.prefilCache();
160        maxNode = maxSize;
161        nbNodes = new AtomicInteger(0);
162        nbFolders = new AtomicInteger(1);
163        nbVisitedFolders = new AtomicInteger(0);
164        size = new AtomicLong(0);
165        RandomTextSourceNode.blobSizeInKB = blobSizeInKB;
166        minGlobalFolders = maxNode / DEFAULT_NB_DATA_NODES_PER_FOLDER;
167        minFoldersPerNode = 1 + (int) Math.pow(minGlobalFolders, (1.0 / MAX_DEPTH));
168        nonUniformRepartition = nonUniform;
169        return new RandomTextSourceNode(true, 0, 0, onlyText, withProperties);
170    }
171
172    protected String getBlobMimeType() {
173        if (onlyText) {
174            return "text/plain";
175        } else {
176            return "text/partial";
177        }
178    }
179
180    private String capitalize(final String line) {
181        return Character.toUpperCase(line.charAt(0)) + line.substring(1);
182    }
183
184    @Override
185    public BlobHolder getBlobHolder() {
186        String content = null;
187        if (folderish) {
188            if (withProperties) {
189                return new SimpleBlobHolderWithProperties((Blob) null, getRandomProperties(content));
190            }
191            return null;
192        }
193        if (blobSizeInKB == null) {
194            content = gen.getRandomText();
195        } else {
196            content = gen.getRandomText(blobSizeInKB);
197        }
198        size.addAndGet(content.length());
199        Blob blob = Blobs.createBlob(content, getBlobMimeType(), null, getName() + ".txt");
200        if (withProperties) {
201            return new SimpleBlobHolderWithProperties(blob, getRandomProperties(content));
202        }
203        return new SimpleBlobHolder(blob);
204    }
205
206    protected Map<String, Serializable> getRandomProperties(String content) {
207        Map<String, Serializable> ret = new HashMap<>();
208        ret.put("dc:title", capitalize(getName()));
209        if (hazard.nextInt(10) == 1) {
210            String description;
211            if (content != null && ! content.isEmpty()) {
212                description = content.substring(0, content.indexOf(' ', 40));
213            } else {
214                description = gen.getRandomTitle(hazard.nextInt(5)+1);
215            }
216            ret.put("dc:description", capitalize(description));
217        }
218        ret.put("dc:nature", getGaussian(DC_NATURE));
219        ret.put("dc:subjects", (Serializable) Arrays.asList(getGaussian(DC_SUBJECTS)));
220        ret.put("dc:rights", getGaussian(DC_RIGHTS));
221        ret.put("dc:language", getGaussian(DC_LANGUAGE));
222        ret.put("dc:coverage", getGaussian(DC_COVERAGE));
223        ret.put("dc:source", getGaussian(DC_SOURCE));
224        // validation contraint violation
225        // ret.put("dc:creator", String.format("user%03d", hazard.nextInt(500)));
226        return ret;
227    }
228
229    protected String getGaussian(String[] words) {
230        double g = Math.abs(hazard.nextGaussian() / 4);
231        g = Math.min(g, 1);
232        int i = (int) Math.floor(g * (words.length - 1));
233        return words[ i ];
234    }
235
236    protected int getMidRandom(int target) {
237        return 1 + (target / 2) + hazard.nextInt(target);
238    }
239
240    /**
241     * Allows to get a non uniform distribution of the number of nodes per folder. Returns:
242     * <ul>
243     * <li>A small number of nodes 10% of the time, see {@link #SMALL_NB_BODES_DIVIDER}.</li>
244     * <li>A big number of nodes 10% of the time, see {@link #BIG_NB_NODES_FACTOR}.</li>
245     * <li>A random variation of the target number of nodes 80% of the time.</li>
246     * </ul>
247     */
248    protected int getNonUniform(int target, boolean folderish) {
249        int res;
250        int remainder = nbVisitedFolders.get() % 10;
251        if (remainder == 8) {
252            res = 1 + target / SMALL_NB_BODES_DIVIDER;
253            if (log.isDebugEnabled()) {
254                String nodeStr;
255                if (folderish) {
256                    nodeStr = "folderish";
257                } else {
258                    nodeStr = "data";
259                }
260                log.debug(String.format("### Small number of %s nodes: %d", nodeStr, res));
261            }
262        } else if (remainder == 9) {
263            int factor;
264            // Big number of folderish nodes is 10 times smaller than the big number of data nodes
265            if (folderish) {
266                factor = BIG_NB_NODES_FACTOR / 10;
267            } else {
268                factor = BIG_NB_NODES_FACTOR;
269            }
270            res = 1 + target * factor;
271            if (log.isDebugEnabled()) {
272                String nodeStr;
273                if (folderish) {
274                    nodeStr = "folderish";
275                } else {
276                    nodeStr = "data";
277                }
278                log.debug(String.format("### Big number of %s nodes: %d", nodeStr, res));
279            }
280        } else {
281            res = getMidRandom(target);
282        }
283        return res;
284    }
285
286    protected int getMaxChildren() {
287        if (maxNode < nbNodes.get()) {
288            return 0;
289        }
290        int targetRemainingFolders = minGlobalFolders - nbFolders.get();
291        if (targetRemainingFolders <= 0) {
292            return DEFAULT_NB_DATA_NODES_PER_FOLDER + 1;
293        }
294        int target = ((maxNode - nbNodes.get()) / targetRemainingFolders);
295        if (target <= 0) {
296            return 0;
297        }
298        if (nonUniformRepartition) {
299            return getNonUniform(target, false);
300        } else {
301            return getMidRandom(target);
302        }
303    }
304
305    protected int getMaxFolderish() {
306        if (maxNode <= nbNodes.get()) {
307            return 0;
308        }
309        if (nonUniformRepartition) {
310            return getNonUniform(minFoldersPerNode, true);
311        } else {
312            return getMidRandom(minFoldersPerNode);
313        }
314    }
315
316    @Override
317    public List<SourceNode> getChildren() {
318
319        if (!folderish) {
320            return null;
321        }
322
323        if (cachedChildren != null) {
324            return cachedChildren;
325        }
326
327        List<SourceNode> children = new ArrayList<SourceNode>();
328        if (nbNodes.get() > maxNode) {
329            return children;
330        }
331
332        int nbChildren = getMaxChildren();
333        for (int i = 0; i < nbChildren; i++) {
334            children.add(new RandomTextSourceNode(false, level, i, onlyText, withProperties));
335        }
336        nbNodes.addAndGet(nbChildren);
337        if (log.isDebugEnabled()) {
338            String nodeStr;
339            if (nbChildren > 1) {
340                nodeStr = "nodes";
341            } else {
342                nodeStr = "node";
343            }
344            log.debug(String.format("Added %s data %s to %s; data node total count = %s", nbChildren, nodeStr,
345                    getName(), nbNodes));
346        }
347
348        if (level < MAX_DEPTH) {
349            // In the case of a non uniform repartition, don't add folderish nodes if there are no data nodes to not
350            // overload the tree with folderish nodes that would probably be empty
351            if (!nonUniformRepartition || nbChildren > 0) {
352                int nbFolderish = getMaxFolderish();
353                for (int i = 0; i < nbFolderish; i++) {
354                    children.add(new RandomTextSourceNode(true, level + 1, i, onlyText, withProperties));
355                }
356                nbFolders.addAndGet(nbFolderish);
357                if (log.isDebugEnabled()) {
358                    String nodeStr;
359                    if (nbFolderish > 1) {
360                        nodeStr = "nodes";
361                    } else {
362                        nodeStr = "node";
363                    }
364                    log.debug(String.format("Added %s folderish %s to %s; folderish node total count = %s",
365                            nbFolderish, nodeStr, getName(), nbFolders));
366                }
367            }
368        }
369        if (CACHE_CHILDREN) {
370            cachedChildren = children;
371        }
372
373        nbVisitedFolders.incrementAndGet();
374        if (log.isDebugEnabled()) {
375            String folderStr;
376            if (nbVisitedFolders.get() > 1) {
377                folderStr = "folders";
378            } else {
379                folderStr = "folder";
380            }
381            log.debug(String.format("Visited %s %s", nbVisitedFolders, folderStr));
382        }
383
384        return children;
385    }
386
387    @Override
388    public String getName() {
389        if (name == null) {
390            if (withProperties) {
391                name = gen.getRandomTitle(hazard.nextInt(3)+1);
392            }
393            else {
394                if (folderish) {
395                    name = "folder";
396                } else {
397                    name = "file";
398                }
399                if (level == 0 && folderish) {
400                    name = name + "-" + (System.currentTimeMillis() % 10000) + hazard.nextInt(100);
401                } else {
402                    name = name + "-" + level + "-" + idx;
403                }
404            }
405        }
406        return name;
407    }
408
409    @Override
410    public boolean isFolderish() {
411        return folderish;
412    }
413
414    public static Integer getNbNodes() {
415        return nbNodes.get();
416    }
417
418    public static Long getSize() {
419        return size.get();
420    }
421
422    public int getLevel() {
423        return level;
424    }
425
426    @Override
427    public String getSourcePath() {
428        return null;
429    }
430}