/*
 * (C) Copyright 2009 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Olivier Grisel
 */
package org.nuxeo.ecm.platform.categorization.categorizer.tfidf;

import java.io.Serializable;
import java.util.List;

/**
 * Hashed vector representation of the token unigrams and bigrams of a document provided as a sequence of tokens.
 * <p>
 * We use a hash representations to be able to maintain low memory requirements by avoiding to store an explicit map
 * from string bigrams to feature vector index in memory.
 * <p>
 * http://hunch.net/~jl/projects/hash_reps/index.html http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters
 *
 * @author ogrisel
 */
public class HashingVectorizer implements Serializable {

    private static final long serialVersionUID = 1L;

    // Number of hash buckets, which is also the dimension of the produced count vectors. Default: 2 ** 19.
    protected int dim = 524288; // 2 ** 19

    // Number of distinct hash functions folded into each ngram count (counting Bloom filter style, to
    // soften the impact of individual collisions).
    protected int probes = 2;

    // Maximum token-pair distance hashed in addition to unigrams; 0 disables pair (ngram) features entirely.
    protected int window = 0;

    /**
     * Chain configuration of the number of buckets, which is also the number of the vectors dimensions, small values
     * mean high probabilities of collisions.
     *
     * @param dim the number of hash buckets (vector dimensions), must be strictly positive
     * @return this vectorizer, for call chaining
     * @throws IllegalArgumentException if {@code dim} is zero or negative (would otherwise surface later as a
     *             {@link NegativeArraySizeException} or empty vectors in {@link #count})
     */
    public HashingVectorizer dimension(int dim) {
        if (dim <= 0) {
            throw new IllegalArgumentException("dim must be strictly positive: " + dim);
        }
        this.dim = dim;
        return this;
    }

    /**
     * Chain configuration of the token window size: {@code window = 0} hashes unigrams only, {@code window = 1} adds
     * adjacent token pairs (bigrams), {@code window = 2} also adds pairs of tokens at distance 2, and so on.
     *
     * @param window the maximum pair distance to hash, must not be negative
     * @return this vectorizer, for call chaining
     * @throws IllegalArgumentException if {@code window} is negative
     */
    public HashingVectorizer window(int window) {
        if (window < 0) {
            throw new IllegalArgumentException("window must not be negative: " + window);
        }
        this.window = window;
        return this;
    }

    /**
     * Chain configuration of the number of probes, i.e. number of distinct hash functions to use for each ngram count.
     *
     * @param probes the number of hash functions per ngram, must be strictly positive
     * @return this vectorizer, for call chaining
     * @throws IllegalArgumentException if {@code probes} is zero or negative (would otherwise silently yield all-zero
     *             vectors)
     */
    public HashingVectorizer probes(int probes) {
        if (probes <= 0) {
            throw new IllegalArgumentException("probes must be strictly positive: " + probes);
        }
        this.probes = probes;
        return this;
    }

    /**
     * Computes the dense hashed count vector of the given token sequence.
     *
     * @param tokens the document as a sequence of tokens
     * @return a new array of {@code dim} counts
     */
    // TODO: implement a sparse equivalent using an long[] for the indices of
    // non-zero values and a second count[] for the counts them-selves
    public long[] count(List<String> tokens) {
        long[] counts = new long[dim];
        addCounts(tokens, counts);
        return counts;
    }

    /**
     * Accumulates the hashed counts of the given token sequence into an existing vector, allowing several documents
     * (or document chunks) to be folded into the same counts.
     *
     * @param tokens the document as a sequence of tokens
     * @param counts the accumulator, expected to be of length {@code dim}
     */
    public void addCounts(List<String> tokens, long[] counts) {
        int n = 0;
        for (String token : tokens) {
            // unigram features: one bucket increment per probe
            for (int probe = 0; probe < probes; probe++) {
                counts[hash(token, probe)]++;
            }
            if (window > 0) {
                // pair features: hash the current token with each previous token within the window
                for (int j = Math.max(0, n - window); j < n; j++) {
                    for (int probe = 0; probe < probes; probe++) {
                        counts[hash(token, tokens.get(j), probe)]++;
                    }
                }
            }
            n++;
        }
    }

    /**
     * Hashes a single token (unigram) for the given probe.
     */
    protected int hash(String token, int probe) {
        return hash(token, null, probe);
    }

    /**
     * Hashes a token pair for the given probe into a bucket index in {@code [0, dim)}.
     * <p>
     * Note: a null {@code prevToken} (unigram case) is deliberately folded in as the literal string "null" by string
     * concatenation, keeping unigram and pair feature spaces distinct.
     */
    protected int hash(String token, String prevToken, int probe) {
        int h = (token + " " + prevToken + " " + probe).hashCode() % dim;
        if (h < 0) {
            // Java's % can return a negative remainder; shift into [0, dim)
            h += dim;
        }
        return h;
    }

}