001/* 002 * (C) Copyright 2006-2008 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Nuxeo - initial API and implementation 018 * 019 * $Id$ 020 */ 021 022package org.nuxeo.ecm.platform.importer.source; 023 024import java.io.Serializable; 025import java.util.ArrayList; 026import java.util.Arrays; 027import java.util.HashMap; 028import java.util.List; 029import java.util.Map; 030import java.util.Random; 031import java.util.concurrent.atomic.AtomicInteger; 032import java.util.concurrent.atomic.AtomicLong; 033 034import org.apache.commons.logging.Log; 035import org.apache.commons.logging.LogFactory; 036import org.nuxeo.ecm.core.api.Blob; 037import org.nuxeo.ecm.core.api.Blobs; 038import org.nuxeo.ecm.core.api.blobholder.BlobHolder; 039import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder; 040import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolderWithProperties; 041import org.nuxeo.ecm.platform.importer.random.DictionaryHolder; 042import org.nuxeo.ecm.platform.importer.random.HunspellDictionaryHolder; 043import org.nuxeo.ecm.platform.importer.random.RandomTextGenerator; 044 045 046/** 047 * Random {@link SourceNode} to be used for load testing 048 * 049 * @author Thierry Delprat 050 */ 051public class RandomTextSourceNode implements SourceNode { 052 053 private static final Log log = LogFactory.getLog(RandomTextSourceNode.class); 054 055 protected static RandomTextGenerator gen; 056 057 protected static int maxNode = 10000; 058 059 /** 060 * Used in {@link #getMaxChildren()} and {@link #getMaxFolderish()}. 061 */ 062 protected static boolean nonUniformRepartition = false; 063 064 public static final int MAX_DEPTH = 8; 065 066 public static final int DEFAULT_NB_DATA_NODES_PER_FOLDER = 100; 067 068 /** 069 * Used to generate a big number of children nodes when {@link #nonUniformRepartition} is {@code true}. 070 */ 071 public static final int BIG_NB_NODES_FACTOR = 50; 072 073 /** 074 * Used to generate a small number of children nodes when {@link #nonUniformRepartition} is {@code true}. 075 */ 076 public static final int SMALL_NB_BODES_DIVIDER = DEFAULT_NB_DATA_NODES_PER_FOLDER; 077 078 protected static int minGlobalFolders = 0; 079 080 protected static int minFoldersPerNode = 0; 081 082 protected static AtomicInteger nbNodes; 083 084 protected static AtomicInteger nbFolders; 085 086 protected static AtomicInteger nbVisitedFolders; 087 088 protected static AtomicLong size; 089 090 protected static final Random RANDOM = new Random(); // NOSONAR (doesn't need cryptographic strength) 091 092 protected String name; 093 094 protected boolean folderish; 095 096 protected int level = 0; 097 098 protected int idx = 0; 099 100 protected static Integer blobSizeInKB; 101 102 protected List<SourceNode> cachedChildren = null; 103 104 protected static final boolean CACHE_CHILDREN = false; 105 106 protected boolean onlyText = true; 107 108 protected boolean withProperties = false; 109 110 protected static final String[] DC_NATURE = { "article", "acknowledgement", "assessment", "application", "order", 111 "contract", "quotation", "fax", "worksheet", "letter", "memo", "note", "notification", "procedure", 112 "report", "internshipReport", "pressReview" }; 113 114 protected static final String[] DC_SUBJECTS = { "art/architecture", "art/comics", "art/cinema", "art/culture", 115 "art/danse", "art/music", "sciences/astronomy", "sciences/biology", "sciences/chemistry", "sciences/math", 116 "sciences/physic", "society/ecology", "daily life/gastronomy", "daily life/gardening", "daily life/sport", 117 "technology/it" }; 118 119 protected static final String[] DC_RIGHTS = { "OpenContentL", "CC-BY-NC", "CC-BY-ND", "FreeArt", "ODbi", "GNUGPL", 120 "FreeBSD", "CC0" }; 121 122 protected static final String[] DC_LANGUAGE = { "IT", "DE", "FR", "US", "EN" }; 123 124 protected static final String[] DC_SOURCE = { "internal", "external", "unknown" }; 125 126 protected static final String[] DC_COVERAGE = { "europe/France", "europe/Germany", "europe/Italy", "europe/Spain", 127 "oceania/Tonga", "africa/Mali", "asia/Japan", "north-america/United_States_of_America" }; 128 129 public RandomTextSourceNode(boolean folderish, int level, int idx, boolean onlyText, boolean withProperties) { 130 this.folderish = folderish; 131 this.level = level; 132 this.idx = idx; 133 this.onlyText = onlyText; 134 this.withProperties = withProperties; 135 } 136 137 public RandomTextSourceNode(boolean folderish, int level, int idx, boolean onlyText) { 138 this(folderish, level, idx, onlyText, false); 139 } 140 141 public static RandomTextSourceNode init(int maxSize) { 142 return init(maxSize, null, true); 143 } 144 145 public static RandomTextSourceNode init(int maxSize, Integer blobSizeInKB, boolean onlyText) { 146 return init(maxSize, blobSizeInKB, onlyText, false, false, null); 147 } 148 149 public static RandomTextSourceNode init(int maxSize, Integer blobSizeInKB, boolean onlyText, boolean nonUniform, 150 boolean withProperties, String lang) { 151 return init(maxSize, blobSizeInKB, onlyText, new HunspellDictionaryHolder(lang), nonUniform, 152 withProperties); 153 } 154 155 public static RandomTextSourceNode init(int maxSize, Integer blobSizeInKB, boolean onlyText, 156 DictionaryHolder dictionaryHolder, boolean nonUniform, boolean withProperties) { 157 gen = new RandomTextGenerator(dictionaryHolder); 158 gen.prefilCache(); 159 maxNode = maxSize; 160 nbNodes = new AtomicInteger(0); 161 nbFolders = new AtomicInteger(1); 162 nbVisitedFolders = new AtomicInteger(0); 163 size = new AtomicLong(0); 164 RandomTextSourceNode.blobSizeInKB = blobSizeInKB; 165 minGlobalFolders = maxNode / DEFAULT_NB_DATA_NODES_PER_FOLDER; 166 minFoldersPerNode = 1 + (int) Math.pow(minGlobalFolders, (1.0 / MAX_DEPTH)); 167 nonUniformRepartition = nonUniform; 168 return new RandomTextSourceNode(true, 0, 0, onlyText, withProperties); 169 } 170 171 protected String getBlobMimeType() { 172 if (onlyText) { 173 return "text/plain"; 174 } else { 175 return "text/partial"; 176 } 177 } 178 179 private String capitalize(final String line) { 180 return Character.toUpperCase(line.charAt(0)) + line.substring(1); 181 } 182 183 @Override 184 public BlobHolder getBlobHolder() { 185 String content = null; 186 if (folderish) { 187 if (withProperties) { 188 return new SimpleBlobHolderWithProperties((Blob) null, getRandomProperties(content)); 189 } 190 return null; 191 } 192 if (blobSizeInKB == null) { 193 content = gen.getRandomText(); 194 } else { 195 content = gen.getRandomText(blobSizeInKB); 196 } 197 size.addAndGet(content.length()); 198 Blob blob = Blobs.createBlob(content, getBlobMimeType(), null, getName() + ".txt"); 199 if (withProperties) { 200 return new SimpleBlobHolderWithProperties(blob, getRandomProperties(content)); 201 } 202 return new SimpleBlobHolder(blob); 203 } 204 205 protected Map<String, Serializable> getRandomProperties(String content) { 206 Map<String, Serializable> ret = new HashMap<>(); 207 ret.put("dc:title", capitalize(getName())); 208 if (RANDOM.nextInt(10) == 1) { 209 String description; 210 if (content != null && ! content.isEmpty()) { 211 description = content.substring(0, content.indexOf(' ', 40)); 212 } else { 213 description = gen.getRandomTitle(RANDOM.nextInt(5)+1); 214 } 215 ret.put("dc:description", capitalize(description)); 216 } 217 ret.put("dc:nature", getGaussian(DC_NATURE)); 218 ret.put("dc:subjects", (Serializable) Arrays.asList(getGaussian(DC_SUBJECTS))); 219 ret.put("dc:rights", getGaussian(DC_RIGHTS)); 220 ret.put("dc:language", getGaussian(DC_LANGUAGE)); 221 ret.put("dc:coverage", getGaussian(DC_COVERAGE)); 222 ret.put("dc:source", getGaussian(DC_SOURCE)); 223 // validation contraint violation 224 // ret.put("dc:creator", String.format("user%03d", hazard.nextInt(500))); 225 return ret; 226 } 227 228 protected String getGaussian(String[] words) { 229 double g = Math.abs(RANDOM.nextGaussian() / 4); 230 g = Math.min(g, 1); 231 int i = (int) Math.floor(g * (words.length - 1)); 232 return words[ i ]; 233 } 234 235 protected int getMidRandom(int target) { 236 return 1 + (target / 2) + RANDOM.nextInt(target); 237 } 238 239 /** 240 * Allows to get a non uniform distribution of the number of nodes per folder. Returns: 241 * <ul> 242 * <li>A small number of nodes 10% of the time, see {@link #SMALL_NB_BODES_DIVIDER}.</li> 243 * <li>A big number of nodes 10% of the time, see {@link #BIG_NB_NODES_FACTOR}.</li> 244 * <li>A random variation of the target number of nodes 80% of the time.</li> 245 * </ul> 246 */ 247 protected int getNonUniform(int target, boolean folderish) { 248 int res; 249 int remainder = nbVisitedFolders.get() % 10; 250 if (remainder == 8) { 251 res = 1 + target / SMALL_NB_BODES_DIVIDER; 252 if (log.isDebugEnabled()) { 253 String nodeStr; 254 if (folderish) { 255 nodeStr = "folderish"; 256 } else { 257 nodeStr = "data"; 258 } 259 log.debug(String.format("### Small number of %s nodes: %d", nodeStr, res)); 260 } 261 } else if (remainder == 9) { 262 int factor; 263 // Big number of folderish nodes is 10 times smaller than the big number of data nodes 264 if (folderish) { 265 factor = BIG_NB_NODES_FACTOR / 10; 266 } else { 267 factor = BIG_NB_NODES_FACTOR; 268 } 269 res = 1 + target * factor; 270 if (log.isDebugEnabled()) { 271 String nodeStr; 272 if (folderish) { 273 nodeStr = "folderish"; 274 } else { 275 nodeStr = "data"; 276 } 277 log.debug(String.format("### Big number of %s nodes: %d", nodeStr, res)); 278 } 279 } else { 280 res = getMidRandom(target); 281 } 282 return res; 283 } 284 285 protected int getMaxChildren() { 286 if (maxNode < nbNodes.get()) { 287 return 0; 288 } 289 int targetRemainingFolders = minGlobalFolders - nbFolders.get(); 290 if (targetRemainingFolders <= 0) { 291 return DEFAULT_NB_DATA_NODES_PER_FOLDER + 1; 292 } 293 int target = ((maxNode - nbNodes.get()) / targetRemainingFolders); 294 if (target <= 0) { 295 return 0; 296 } 297 if (nonUniformRepartition) { 298 return getNonUniform(target, false); 299 } else { 300 return getMidRandom(target); 301 } 302 } 303 304 protected int getMaxFolderish() { 305 if (maxNode <= nbNodes.get()) { 306 return 0; 307 } 308 if (nonUniformRepartition) { 309 return getNonUniform(minFoldersPerNode, true); 310 } else { 311 return getMidRandom(minFoldersPerNode); 312 } 313 } 314 315 @Override 316 public List<SourceNode> getChildren() { 317 318 if (!folderish) { 319 return null; 320 } 321 322 if (cachedChildren != null) { 323 return cachedChildren; 324 } 325 326 List<SourceNode> children = new ArrayList<>(); 327 if (nbNodes.get() > maxNode) { 328 return children; 329 } 330 331 int nbChildren = getMaxChildren(); 332 for (int i = 0; i < nbChildren; i++) { 333 children.add(new RandomTextSourceNode(false, level, i, onlyText, withProperties)); 334 } 335 nbNodes.addAndGet(nbChildren); 336 if (log.isDebugEnabled()) { 337 String nodeStr; 338 if (nbChildren > 1) { 339 nodeStr = "nodes"; 340 } else { 341 nodeStr = "node"; 342 } 343 log.debug(String.format("Added %s data %s to %s; data node total count = %s", nbChildren, nodeStr, 344 getName(), nbNodes)); 345 } 346 347 if (level < MAX_DEPTH) { 348 // In the case of a non uniform repartition, don't add folderish nodes if there are no data nodes to not 349 // overload the tree with folderish nodes that would probably be empty 350 if (!nonUniformRepartition || nbChildren > 0) { 351 int nbFolderish = getMaxFolderish(); 352 for (int i = 0; i < nbFolderish; i++) { 353 children.add(new RandomTextSourceNode(true, level + 1, i, onlyText, withProperties)); 354 } 355 nbFolders.addAndGet(nbFolderish); 356 if (log.isDebugEnabled()) { 357 String nodeStr; 358 if (nbFolderish > 1) { 359 nodeStr = "nodes"; 360 } else { 361 nodeStr = "node"; 362 } 363 log.debug(String.format("Added %s folderish %s to %s; folderish node total count = %s", 364 nbFolderish, nodeStr, getName(), nbFolders)); 365 } 366 } 367 } 368 if (CACHE_CHILDREN) { 369 cachedChildren = children; 370 } 371 372 nbVisitedFolders.incrementAndGet(); 373 if (log.isDebugEnabled()) { 374 String folderStr; 375 if (nbVisitedFolders.get() > 1) { 376 folderStr = "folders"; 377 } else { 378 folderStr = "folder"; 379 } 380 log.debug(String.format("Visited %s %s", nbVisitedFolders, folderStr)); 381 } 382 383 return children; 384 } 385 386 @Override 387 public String getName() { 388 if (name == null) { 389 if (withProperties) { 390 name = gen.getRandomTitle(RANDOM.nextInt(3)+1); 391 } 392 else { 393 if (folderish) { 394 name = "folder"; 395 } else { 396 name = "file"; 397 } 398 if (level == 0 && folderish) { 399 name = name + "-" + (System.currentTimeMillis() % 10000) + RANDOM.nextInt(100); 400 } else { 401 name = name + "-" + level + "-" + idx; 402 } 403 } 404 } 405 return name; 406 } 407 408 @Override 409 public boolean isFolderish() { 410 return folderish; 411 } 412 413 public static Integer getNbNodes() { 414 return nbNodes.get(); 415 } 416 417 public static Long getSize() { 418 return size.get(); 419 } 420 421 public int getLevel() { 422 return level; 423 } 424 425 @Override 426 public String getSourcePath() { 427 return null; 428 } 429}