Codebits by David Medinets

After dumping random datasets into my D4M schema, I was curious about the cardinality of the fields. I wanted to answer the question:

How many different first names exist?

Not wanting to overload RAM with a large HashMap I investigated alternatives and settled on the HyperLogLog approach used by the _streamlib_ project from AddThis. You can add the jar files to your project using this Maven dependency:


<dependency>
<groupId>com.clearspring.analytics</groupId>
<artifactId>stream</artifactId>
<version>2.6.0</version>
<type>jar</type>
</dependency>

This library is really easy to use. Just create an ICardinality object then add Longs representing values. Following is a code snippet showing the idea:

ICardinality estimator = new AdaptiveCounting(16);

// Add david
long hashCode = MurmurHash.hash64("firstname|david");
estimator.offer(hashCode);

// Add john
long hashCode = MurmurHash.hash64("firstname|john");
estimator.offer(hashCode);

System.out.println(String.format("There were %d instances of firstname.", estimator.cardinality()));

It's fairly simple to integrate the ICardinality objects into a utility to find Cardinality for all field names using the TedgeDegree table.

package com.codebits.examples.d4m;

import com.clearspring.analytics.hash.MurmurHash;
import com.clearspring.analytics.stream.cardinality.AdaptiveCounting;
import com.clearspring.analytics.stream.cardinality.ICardinality;
import com.codebits.d4m.PropertyManager;
import com.codebits.d4m.TableManager;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.TreeMap;
import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.client.ZooKeeperInstance;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.Authorizations;

public class FieldCardinality {

    private final String factDelimiter = "|";
    private static final Charset charset = Charset.defaultCharset();

    public static void main(String[] args) throws IOException, AccumuloException, AccumuloSecurityException, TableNotFoundException {
        FieldCardinality driver = new FieldCardinality();
        driver.process(args);
    }

    public void process(String[] args) throws IOException, AccumuloException, AccumuloSecurityException, TableNotFoundException {

        PropertyManager propertyManager = new PropertyManager();
        propertyManager.setPropertyFilename("d4m.properties");
        Properties properties = propertyManager.load();

        String instanceName = properties.getProperty("accumulo.instance.name");
        String zooKeepers = properties.getProperty("accumulo.zookeeper.ensemble");
        String user = properties.getProperty("accumulo.user");
        byte[] pass = properties.getProperty("accumulo.password").getBytes(charset);

        ZooKeeperInstance instance = new ZooKeeperInstance(instanceName, zooKeepers);
        Connector connector = instance.getConnector(user, pass);

        TableManager tableManager = new TableManager(connector.tableOperations());
        String tableName = tableManager.getDegreeTable();

        Map<String, ICardinality> estimators = new TreeMap<String, ICardinality>();

        Scanner scan = connector.createScanner(tableName, new Authorizations());
        Iterator<Map.Entry<Key, Value>> iterator = scan.iterator();
        while (iterator.hasNext()) {
            Map.Entry<Key, Value> entry = iterator.next();

            String row = entry.getKey().getRow().toString();
            String factName = row.substring(0, row.indexOf(factDelimiter));

            long hashCode = MurmurHash.hash64(row);

            ICardinality estimator = estimators.get(factName);
            if (estimator == null) {
                estimator = new AdaptiveCounting(16);
                estimators.put(factName, estimator);
            }

            estimator.offer(hashCode);
        }

        for (Entry<String, ICardinality> entry : estimators.entrySet()) {
            String factName = entry.getKey();
            ICardinality estimator = entry.getValue();
            System.out.println(String.format("%s: %d", factName, estimator.cardinality()));
        }
    }
}

Clever readers will be complaining that I don't need to store a HashMap of ICardinality object because field names are stored lexically sorted. A single ICardinality object can be reset which each field name is seen. However, this program is just the first step. I'll try to serialize the estimators so they can be re-used in the future. The idea is that only new Accumulo entries need to be visited.