/*
 * (C) Copyright 2009 Nuxeo SAS (http://nuxeo.com/) and contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser General Public License
 * (LGPL) version 2.1 which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl.html
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * Contributors:
 *     Olivier Grisel
 */
package org.nuxeo.ecm.platform.categorization.categorizer.tfidf;

import java.io.Serializable;
import java.util.List;

/**
 * Hashed vector representation of the token unigrams and bigrams of a document provided as a sequence of tokens.
 * <p>
 * We use a hash representation to be able to maintain low memory requirements by avoiding to store an explicit map
 * from string bigrams to feature vector index in memory.
 * <p>
 * http://hunch.net/~jl/projects/hash_reps/index.html http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters
 *
 * @author ogrisel
 */
public class HashingVectorizer implements Serializable {

    private static final long serialVersionUID = 1L;

    // Number of hash buckets, i.e. the dimensionality of the output vectors. 2 ** 19 by default.
    protected int dim = 524288; // 2 ** 19

    // Number of distinct hash functions folded into each ngram count (counting Bloom filter style).
    protected int probes = 2;

    // Maximum token distance for pair features; 0 disables pair features entirely.
    protected int window = 0;

    /**
     * Chain configuration of the number of buckets, which is also the number of the vectors dimensions, small values
     * mean high probabilities of collisions.
     *
     * @param dim the number of buckets, must be strictly positive
     * @return this vectorizer, for call chaining
     * @throws IllegalArgumentException if {@code dim} is not strictly positive (a non-positive dimension would
     *             otherwise only fail later, deep inside {@link #count(List)} or {@link #hash(String, int)})
     */
    public HashingVectorizer dimension(int dim) {
        if (dim < 1) {
            throw new IllegalArgumentException("dimension must be strictly positive, got: " + dim);
        }
        this.dim = dim;
        return this;
    }

    /**
     * Chain configuration of the token pair window: window = 0 means unigrams only, window = 1 adds bigrams of
     * adjacent tokens, window = 2 also adds token pairs of distance 2, and so on.
     *
     * @param window the maximum distance between paired tokens, must be non-negative
     * @return this vectorizer, for call chaining
     * @throws IllegalArgumentException if {@code window} is negative
     */
    public HashingVectorizer window(int window) {
        if (window < 0) {
            throw new IllegalArgumentException("window must be non-negative, got: " + window);
        }
        this.window = window;
        return this;
    }

    /**
     * Chain configuration of the number of probes, i.e. number of distinct hash functions to use for each ngram count.
     *
     * @param probes the number of hash functions per ngram, must be non-negative
     * @return this vectorizer, for call chaining
     * @throws IllegalArgumentException if {@code probes} is negative
     */
    public HashingVectorizer probes(int probes) {
        if (probes < 0) {
            throw new IllegalArgumentException("probes must be non-negative, got: " + probes);
        }
        this.probes = probes;
        return this;
    }

    /**
     * Builds a dense count vector of the hashed token features of the document.
     *
     * @param tokens the document as a pre-tokenized sequence of tokens
     * @return a new dense array of {@link #dim} hashed feature counts
     */
    // TODO: implement a sparse equivalent using an long[] for the indices of
    // non-zero values and a second count[] for the counts them-selves
    public long[] count(List<String> tokens) {
        long[] counts = new long[dim];
        addCounts(tokens, counts);
        return counts;
    }

    /**
     * Accumulates the hashed token (and token pair, if {@code window > 0}) counts of the document into an existing
     * counts array, so that several documents can be folded into a single vector.
     *
     * @param tokens the document as a pre-tokenized sequence of tokens
     * @param counts the accumulator array, expected to be of length {@link #dim}
     */
    public void addCounts(List<String> tokens, long[] counts) {
        int n = 0; // index of the current token in the sequence
        for (String token : tokens) {
            // unigram features: one bucket increment per probe
            for (int probe = 0; probe < probes; probe++) {
                counts[hash(token, probe)]++;
            }
            if (window > 0) {
                // pair features: combine the current token with each previous token within the window
                for (int j = Math.max(0, n - window); j < n; j++) {
                    for (int probe = 0; probe < probes; probe++) {
                        counts[hash(token, tokens.get(j), probe)]++;
                    }
                }
            }
            n++;
        }
    }

    /**
     * Hashes a single token (unigram) to a bucket index for the given probe.
     *
     * @param token the token to hash
     * @param probe the index of the hash function to use
     * @return a bucket index in {@code [0, dim)}
     */
    protected int hash(String token, int probe) {
        return hash(token, null, probe);
    }

    /**
     * Hashes a token pair (or a single token when {@code prevToken} is null) to a bucket index for the given probe.
     * The probe index is folded into the hashed string so each probe behaves as a distinct hash function.
     *
     * @param token the current token
     * @param prevToken the earlier token of the pair, or null for a unigram feature
     * @param probe the index of the hash function to use
     * @return a bucket index in {@code [0, dim)}
     */
    protected int hash(String token, String prevToken, int probe) {
        int h = (token + " " + prevToken + " " + probe).hashCode() % dim;
        if (h < 0) {
            // String.hashCode may be negative; shift into [0, dim)
            h += dim;
        }
        return h;
    }

}