/*
 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package org.nuxeo.importer.stream.producer;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.blob.BlobInfo;
import org.nuxeo.ecm.platform.importer.random.HunspellDictionaryHolder;
import org.nuxeo.ecm.platform.importer.random.RandomTextGenerator;
import org.nuxeo.importer.stream.message.DocumentMessage;
import org.nuxeo.lib.stream.pattern.producer.AbstractProducer;

/**
 * Produces a stream of randomly generated {@link DocumentMessage}s organised as a tree.
 * <p>
 * The first message is a root folder whose name is prefixed with the two-digit producer id. After that the producer
 * alternates between creating folders and creating documents under the current parent, moving from parent to parent
 * and, once every parent of a level has been visited, descending to the folders created at that level.
 *
 * @since 9.1
 */
public class RandomDocumentMessageProducer extends AbstractProducer<DocumentMessage> {
    private static final Log log = LogFactory.getLog(RandomDocumentMessageProducer.class);

    /** Matches every run of non-word characters; used to turn a title into a name. */
    private static final Pattern NON_WORD = Pattern.compile("\\W+");

    /** Number of messages to produce, see {@link #hasNext()} for what is counted. */
    protected final long nbDocuments;

    /** Optional source of blob information for documents; may be {@code null}. */
    protected final BlobInfoFetcher blobInfoFetcher;

    protected boolean countFolderAsDocument = true;

    protected int maxFoldersPerFolder = 50;

    protected int maxDocumentsPerFolder = 10000;

    protected int blobSizeKB = 0;

    protected boolean blobOnlyText = false;

    protected int documentCount = 0;

    protected int folderCount = 0;

    /**
     * Random source owned by this producer.
     * <p>
     * This is a dedicated {@link Random} rather than a stored {@link ThreadLocalRandom#current()} reference: the JDK
     * documents that a {@code ThreadLocalRandom} must be obtained through {@code current()} at each use and must not
     * be shared between threads. NOTE(review): this assumes {@link #next()} may be invoked from a thread other than
     * the one that ran the constructor; confirm against the code driving the producer.
     */
    protected final Random rand;

    /** Text generator shared by all producers of the JVM, lazily created by the first constructor call. */
    protected static RandomTextGenerator gen;

    protected static final String[] DC_NATURE = { "article", "acknowledgement", "assessment", "application", "order",
            "contract", "quotation", "fax", "worksheet", "letter", "memo", "note", "notification", "procedure",
            "report", "internshipReport", "pressReview" };

    protected static final String[] DC_SUBJECTS = { "art/architecture", "art/comics", "art/cinema", "art/culture",
            "art/danse", "art/music", "sciences/astronomy", "sciences/biology", "sciences/chemistry", "sciences/math",
            "sciences/physic", "society/ecology", "daily life/gastronomy", "daily life/gardening", "daily life/sport",
            "technology/it" };

    protected static final String[] DC_RIGHTS = { "OpenContentL", "CC-BY-NC", "CC-BY-ND", "FreeArt", "ODbi", "GNUGPL",
            "FreeBSD", "CC0" };

    protected static final String[] DC_LANGUAGE = { "IT", "DE", "FR", "US", "EN" };

    protected static final String[] DC_SOURCE = { "internal", "external", "unknown" };

    protected static final String[] DC_COVERAGE = { "europe/France", "europe/Germany", "europe/Italy", "europe/Spain",
            "oceania/Tonga", "africa/Mali", "asia/Japan", "north-america/United_States_of_America" };

    /** Folder-creation phase ends once {@link #folderishChildren} holds at least this many entries. */
    protected int foldersInCurrentFolderLimit;

    /** Document-creation phase ends once more than this many documents were created for the current parent. */
    protected int documentInCurrentFolderLimit;

    /** Kind of node the next call to {@link #next()} will create. */
    protected enum DocType {
        Root, Folder, Document
    }

    protected DocType currentType = DocType.Root;

    /** Index in {@link #parents} of the folder currently being filled. */
    protected int parentIndex = 0;

    /** Ids of the folders of the current level, used as parent path of the created nodes. */
    protected List<String> parents = new ArrayList<>();

    /** Ids of the folders created so far under the current level; becomes {@link #parents} of the next level. */
    protected List<String> folderishChildren = new ArrayList<>();

    /** Names already handed out in the current level; reset only when moving to the next level. */
    protected Set<String> children = new HashSet<>();

    protected int documentInCurrentFolderCount = 0;

    /**
     * Creates a producer.
     *
     * @param producerId the producer identifier, forwarded to the parent class
     * @param nbDocuments the number of messages to produce
     * @param lang the dictionary language handed to {@link HunspellDictionaryHolder}; only taken into account by the
     *            first producer created, because the text generator is shared
     * @param blobInfoFetcher optional fetcher providing blob information for documents, may be {@code null}
     */
    public RandomDocumentMessageProducer(int producerId, long nbDocuments, String lang,
            BlobInfoFetcher blobInfoFetcher) {
        super(producerId);
        this.nbDocuments = nbDocuments;
        rand = new Random();

        // the generator is shared, make sure it is built and cached only once
        synchronized (RandomDocumentMessageProducer.class) {
            if (gen == null) {
                gen = new RandomTextGenerator(new HunspellDictionaryHolder(lang));
                gen.prefilCache();
            }
        }
        this.blobInfoFetcher = blobInfoFetcher;
        log.info("RandomDocumentMessageProducer created, nbDocuments: " + nbDocuments);
    }

    /**
     * Sets the upper bound of the random folder limit. The value is used as bound of {@link Random#nextInt(int)} and
     * therefore must be positive, otherwise {@link #next()} fails with an {@link IllegalArgumentException}.
     *
     * @param max the maximum folder limit
     * @return this producer
     */
    public RandomDocumentMessageProducer setMaxFoldersPerFolder(int max) {
        maxFoldersPerFolder = max;
        return this;
    }

    /**
     * Sets the upper bound of the random document limit. The value is used as bound of {@link Random#nextInt(int)}
     * and therefore must be positive, otherwise {@link #next()} fails with an {@link IllegalArgumentException}.
     *
     * @param max the maximum document limit
     * @return this producer
     */
    public RandomDocumentMessageProducer setMaxDocumentsPerFolder(int max) {
        maxDocumentsPerFolder = max;
        return this;
    }

    /**
     * Chooses whether folders count towards the number of messages to produce.
     *
     * @param value {@code true} to count folders and documents, {@code false} to count documents only
     * @return this producer
     */
    public RandomDocumentMessageProducer countFolderAsDocument(boolean value) {
        countFolderAsDocument = value;
        return this;
    }

    /**
     * Configures the random blob attached to documents when no {@link BlobInfoFetcher} is set.
     *
     * @param sizeKB the size passed to the text generator (presumably kilobytes, per the name); {@code 0} disables
     *            the blob
     * @param onlyText {@code true} to use the {@code text/plain} mime type, {@code false} for {@code text/partial}
     * @return this producer
     */
    public RandomDocumentMessageProducer withBlob(int sizeKB, boolean onlyText) {
        this.blobSizeKB = sizeKB;
        this.blobOnlyText = onlyText;
        return this;
    }

    /**
     * Routes every message of this producer to the same partition, derived from the producer id.
     */
    @Override
    public int getPartition(DocumentMessage message, int partitions) {
        return getProducerId() % partitions;
    }

    /**
     * Returns {@code true} while fewer than {@link #nbDocuments} nodes were created; folders are part of the count
     * only when {@link #countFolderAsDocument} is set.
     */
    @Override
    public boolean hasNext() {
        if (countFolderAsDocument) {
            return (documentCount + folderCount) < nbDocuments;
        }
        return documentCount < nbDocuments;
    }

    /**
     * Creates the next node and advances the Root / Folder / Document state machine.
     */
    @Override
    public DocumentMessage next() {
        DocumentMessage ret;
        switch (currentType) {
        case Root:
            ret = createRoot();
            parents.add(ret.getId());
            currentType = DocType.Folder;
            foldersInCurrentFolderLimit = rand.nextInt(maxFoldersPerFolder) + 1;
            break;
        case Folder:
            ret = createFolder(parents.get(parentIndex), children);
            folderishChildren.add(ret.getId());
            children.add(ret.getName());
            if (folderishChildren.size() >= foldersInCurrentFolderLimit) {
                // enough folders, switch to documents for the current parent
                currentType = DocType.Document;
                documentInCurrentFolderCount = 0;
                documentInCurrentFolderLimit = rand.nextInt(maxDocumentsPerFolder);
            }
            break;
        case Document:
        default:
            ret = createDocument(parents.get(parentIndex), children);
            children.add(ret.getName());
            documentInCurrentFolderCount += 1;
            if (documentInCurrentFolderCount > documentInCurrentFolderLimit) {
                // current parent is full, move on to the next one
                parentIndex += 1;
                if (parentIndex >= parents.size()) {
                    // level exhausted: the folders just created become the parents of the next level
                    parents = folderishChildren;
                    folderishChildren = new ArrayList<>();
                    children = new HashSet<>();
                    parentIndex = 0;
                }
                currentType = DocType.Folder;
                foldersInCurrentFolderLimit = rand.nextInt(maxFoldersPerFolder) + 1;
            }
            break;
        }
        return ret;
    }

    /**
     * Creates the root folder, named with the two-digit producer id as prefix and an empty parent path.
     */
    protected DocumentMessage createRoot() {
        folderCount++;
        return getRandomNodeWithPrefix(String.format("%02d-", getProducerId()), "Folder", "");
    }

    /**
     * Creates a {@code Folder} node without blob and increments the folder counter.
     *
     * @param parentPath the parent of the new folder
     * @param exclude the names that must not be reused
     */
    protected DocumentMessage createFolder(String parentPath, Set<String> exclude) {
        DocumentMessage node = getRandomNodeWithExclusion("Folder", parentPath, false, exclude);
        folderCount++;
        return node;
    }

    /**
     * Creates a {@code File} node with blob and increments the document counter.
     *
     * @param parentPath the parent of the new document
     * @param exclude the names that must not be reused
     */
    protected DocumentMessage createDocument(String parentPath, Set<String> exclude) {
        DocumentMessage node = getRandomNodeWithExclusion("File", parentPath, true, exclude);
        documentCount++;
        return node;
    }

    /**
     * Creates a random node whose name is not part of {@code exclude}.
     * <p>
     * When the generated name is already taken, a numeric suffix is appended; the suffix starts at a random value
     * and is incremented until the resulting name is free.
     *
     * @param type the document type
     * @param parentPath the parent of the new node
     * @param withBlob whether a blob must be attached
     * @param exclude the names that must not be reused
     */
    protected DocumentMessage getRandomNodeWithExclusion(String type, String parentPath, boolean withBlob,
            Set<String> exclude) {
        DocumentMessage node = getRandomNode(type, parentPath, withBlob);
        String name = node.getName();
        if (exclude.contains(name)) {
            // exclude holds at least name, so the bound is positive
            int suffix = rand.nextInt(exclude.size());
            String newName = name + "-" + suffix;
            while (exclude.contains(newName)) {
                suffix++;
                newName = name + "-" + suffix;
            }
            node = DocumentMessage.copy(node, newName);
        }
        return node;
    }

    /**
     * Builds a node with a random title, a name derived from it and random Dublin Core properties.
     * <p>
     * When a blob is requested it comes from the {@link BlobInfoFetcher} if one is set (the document type is then
     * adapted to the blob mime type), otherwise from {@link #getRandomBlob()}.
     *
     * @param type the document type
     * @param parentPath the parent of the new node
     * @param withBlob whether a blob must be attached
     */
    protected DocumentMessage getRandomNode(String type, String parentPath, boolean withBlob) {
        String title = getTitle();
        String name = getName(title);
        HashMap<String, Serializable> props = getRandomProperties(title);
        DocumentMessage.Builder builder = DocumentMessage.builder(type, parentPath, name).setProperties(props);
        if (withBlob) {
            if (blobInfoFetcher != null) {
                BlobInfo blobInfo = blobInfoFetcher.get(builder);
                if (blobInfo != null) {
                    builder.setBlobInfo(blobInfo);
                    if (blobInfo.mimeType != null) {
                        builder.setType(getDocumentTypeForMimeType(blobInfo.mimeType));
                    }
                }
            } else {
                builder.setBlob(getRandomBlob());
            }
        }
        return builder.build();
    }

    /**
     * Maps a mime type to a document type: {@code Picture} for images, {@code Video} for videos, {@code File}
     * otherwise.
     */
    protected String getDocumentTypeForMimeType(String mimeType) {
        if (mimeType.startsWith("image")) {
            return "Picture";
        }
        if (mimeType.startsWith("video")) {
            return "Video";
        }
        return "File";
    }

    /**
     * Builds a node without blob whose name is the given prefix followed by a name derived from a random title.
     *
     * @param prefix the name prefix
     * @param type the document type
     * @param parentPath the parent of the new node
     */
    protected DocumentMessage getRandomNodeWithPrefix(String prefix, String type, String parentPath) {
        String title = getTitle();
        String name = prefix + getName(title);
        HashMap<String, Serializable> props = getRandomProperties(title);
        DocumentMessage.Builder builder = DocumentMessage.builder(type, parentPath, name).setProperties(props);
        return builder.build();
    }

    /**
     * Creates a blob of random text with a random {@code .txt} file name.
     *
     * @return the blob, or {@code null} when the configured blob size is {@code 0}
     */
    protected Blob getRandomBlob() {
        if (blobSizeKB == 0) {
            return null;
        }
        String content = gen.getRandomText(blobSizeKB);
        return Blobs.createBlob(content, getBlobMimeType(), null, getName(getTitle()) + ".txt");
    }

    /**
     * Returns {@code text/plain} when only text blobs are requested, {@code text/partial} otherwise.
     */
    protected String getBlobMimeType() {
        if (blobOnlyText) {
            return "text/plain";
        } else {
            return "text/partial";
        }
    }

    /**
     * Derives a name from a title: every run of non-word characters is replaced by a dash and the result is
     * lower-cased independently of the default locale.
     */
    protected String getName(String title) {
        return NON_WORD.matcher(title).replaceAll("-").toLowerCase(Locale.ROOT);
    }

    /**
     * Returns a capitalized random title; the generator is called with a random argument between 1 and 3.
     */
    protected String getTitle() {
        return capitalize(gen.getRandomTitle(rand.nextInt(3) + 1).trim());
    }

    /**
     * Upper-cases the first character of the given text; an empty text is returned unchanged.
     */
    protected String capitalize(final String line) {
        if (line.isEmpty()) {
            return line;
        }
        return Character.toUpperCase(line.charAt(0)) + line.substring(1);
    }

    /**
     * Builds the Dublin Core properties of a node: the given title, a description in about one case out of ten and
     * one value picked by {@link #getGaussian(String[])} for each of nature, subjects, rights, language, coverage
     * and source.
     */
    protected HashMap<String, Serializable> getRandomProperties(String title) {
        HashMap<String, Serializable> ret = new HashMap<>();
        ret.put("dc:title", title);
        if (rand.nextInt(10) == 1) {
            String description = gen.getRandomTitle(rand.nextInt(5) + 1);
            ret.put("dc:description", capitalize(description));
        }
        ret.put("dc:nature", getGaussian(DC_NATURE));
        ret.put("dc:subjects", (Serializable) Collections.singletonList(getGaussian(DC_SUBJECTS)));
        ret.put("dc:rights", getGaussian(DC_RIGHTS));
        ret.put("dc:language", getGaussian(DC_LANGUAGE));
        ret.put("dc:coverage", getGaussian(DC_COVERAGE));
        ret.put("dc:source", getGaussian(DC_SOURCE));
        return ret;
    }

    /**
     * Picks an entry using the absolute value of a gaussian sample divided by 4 and capped to 1, scaled to the array
     * indexes: entries at the beginning of the array are the most likely.
     */
    protected String getGaussian(String[] words) {
        double g = Math.abs(rand.nextGaussian() / 4);
        g = Math.min(g, 1);
        int i = (int) Math.floor(g * (words.length - 1));
        return words[i];
    }

    /**
     * Closes the parent producer, then the blob info fetcher if any; the fetcher is closed even when closing the
     * parent fails.
     */
    @Override
    public void close() throws Exception {
        try {
            super.close();
        } finally {
            if (blobInfoFetcher != null) {
                blobInfoFetcher.close();
            }
        }
    }

}