Source code for limbo.cli.stats

# Copyright 2021 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
# Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains
# certain rights in this software.

"""Implements the :ref:`limbo-stats` command."""

import argparse
import itertools
import logging
import os
import sys

import numpy
import scipy.sparse
import tqdm

import limbo.data



[docs]
def argument_parser():
    """Build the command-line parser for the ``limbo-stats`` command.

    Returns
    -------
    parser: :class:`argparse.ArgumentParser`
        Parser accepting a set of optional boolean report switches plus one
        or more positional dataset directories (stored as ``datadir``).
    """
    cli = argparse.ArgumentParser(description="Print information about Limbo datasets.")

    # Boolean report switches, registered in the order they should appear in --help.
    switches = (
        ("--annotations", "Display annotation statistics."),
        ("--copyright", "Display copyright statistics."),
        ("--empty-bbox", "Display samples that have empty bounding boxes."),
        ("--license", "Display license statistics."),
        ("--license-csv", "Display license statistics as CSV data."),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)

    # One or more dataset directories are required.
    cli.add_argument("datadir", nargs="+", default=[], help="Limbo dataset director(ies).")
    return cli




[docs]
def main():
    """Entry point for the ``limbo-stats`` command.

    Parses the command line, loads the requested dataset director(ies) with
    :class:`limbo.data.Dataset`, and makes a single pass over the samples,
    recording per-sample flags in a (sample x feature) matrix.  A summary is
    then printed to stdout, followed by the optional reports selected with
    ``--annotations``, ``--copyright``, ``--empty-bbox``, ``--license`` and
    ``--license-csv``.  Progress messages go through :mod:`logging`.
    """
    parser = argument_parser()
    arguments = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(message)s")
    logging.getLogger("imagecat").setLevel(logging.WARN)

    logging.info("Extracting statistics from:")
    for path in arguments.datadir:
        logging.info(f"  {path}")

    dataset = limbo.data.Dataset(arguments.datadir)

    # Setup column indices for a sparse matrix.  Fixed features get their
    # columns up front; categories, copyrights and licenses are assigned
    # columns lazily as new values are encountered.
    column = itertools.count()

    bbox_column = next(column)
    empty_bbox_column = next(column)
    category_columns = {}
    contour_column = next(column)
    copyright_columns = {}
    cryptomatte_column = next(column)
    image_column = next(column)
    index_column = next(column)
    license_columns = {}
    provenance_column = next(column)
    synthetic_column = next(column)
    tag_column = next(column)

    # Iterate over samples, incrementally populating a sparse matrix in coordinate format.
    i = []
    j = []
    data = []

    def mark(row, col, value=1):
        """Record `value` at (`row`, `col`); repeated marks are summed when densified."""
        i.append(row)
        j.append(col)
        data.append(value)

    row_count = 0
    for row, sample in enumerate(tqdm.tqdm(dataset, desc="Samples", unit="sample")):
        row_count = row + 1
        mark(row, index_column)  # Every sample gets at least one entry.

        for category in sample.categories:
            if category not in category_columns:
                category_columns[category] = next(column)
            mark(row, category_columns[category])

        if "annotations" in sample.metadata:
            for annotation in sample.metadata["annotations"]:
                if "bbox" in annotation:
                    mark(row, bbox_column)
                    _, _, width, height = annotation["bbox"]
                    # A box is reported as empty only when both extents are zero.
                    if width == 0 and height == 0:
                        mark(row, empty_bbox_column)
                # Any annotation without contours is counted as a tag.
                if "contours" in annotation:
                    mark(row, contour_column)
                else:
                    mark(row, tag_column)

        if "image" in sample.metadata:
            mark(row, image_column)

        if "provenance" in sample.metadata:
            mark(row, provenance_column)
            provenance = sample.metadata["provenance"]

            # Missing copyright / license values are grouped under the empty string.
            copyright_text = provenance.get("copyright", "")
            if copyright_text not in copyright_columns:
                copyright_columns[copyright_text] = next(column)
            mark(row, copyright_columns[copyright_text])

            license_text = provenance.get("license", "")
            if license_text not in license_columns:
                license_columns[license_text] = next(column)
            mark(row, license_columns[license_text])

        if sample.synthetic:
            mark(row, synthetic_column)
            if sample.synthetic.cryptomatte:
                mark(row, cryptomatte_column)

    categories = sorted(category_columns.keys())
    copyrights = sorted(copyright_columns.keys())
    licenses = sorted(license_columns.keys())

    # Every next(column) call handed out exactly one column, so the next
    # value of the counter is the total number of columns allocated.
    column_count = next(column)

    # Give the matrix an explicit shape.  Left to infer it, scipy sizes the
    # matrix from the largest indices actually marked, so columns that no
    # sample touched would be missing (IndexError below) and an empty dataset
    # could not be converted at all.
    samples = scipy.sparse.coo_matrix(
        (numpy.asarray(data, dtype=int), (numpy.asarray(i, dtype=int), numpy.asarray(j, dtype=int))),
        shape=(row_count, column_count),
    ).toarray()

    print(f"Total samples: {len(dataset)}.")
    print("")
    print(f"- Samples with provenance: {numpy.count_nonzero(samples[:,provenance_column])}")
    print(f"- Samples with images: {numpy.count_nonzero(samples[:,image_column])}")
    print(f"- Samples with synthetic data: {numpy.count_nonzero(samples[:,synthetic_column])}")
    print(f"- Samples with cryptomatte data: {numpy.count_nonzero(samples[:,cryptomatte_column])}")
    print(f"- Samples with bounding-boxes: {numpy.count_nonzero(samples[:,bbox_column])}")
    print(f"- Samples with empty bounding-boxes: {numpy.count_nonzero(samples[:,empty_bbox_column])}")
    print(f"- Samples with contours: {numpy.count_nonzero(samples[:,contour_column])}")
    print(f"- Samples with tags: {numpy.count_nonzero(samples[:,tag_column])}")
    print("")

    print("Categories:")
    print("")
    for category in categories:
        print(f"- Category *{category}* samples: {numpy.count_nonzero(samples[:,category_columns[category]])}")
    print("")

    # Restrict the matrix to the category columns, ordered like `categories`,
    # so that position k of each row corresponds to categories[k].
    category_samples = samples[:, [category_columns[category] for category in categories]]

    if arguments.annotations:
        print("Annotations:")
        print("")
        # Each distinct row is one combination of categories; count the samples sharing it.
        for configuration in numpy.unique(category_samples, axis=0):
            label = " + ".join([f"*{category}*" for category, present in zip(categories, configuration) if present])
            matches = numpy.all(numpy.equal(category_samples, configuration), axis=1)
            print(f"- Samples annotated {label}: {numpy.sum(matches)}")
        print("")

    if arguments.copyright:
        print("Copyrights:")
        print("")
        for copyright_text in copyrights:
            print(f"- Copyright *{copyright_text}* samples: {numpy.count_nonzero(samples[:,copyright_columns[copyright_text]])}")
        print("")

    if arguments.empty_bbox:
        print("Empty Bounding Boxes:")
        print("")
        indices = numpy.flatnonzero(samples[:, empty_bbox_column])
        indices = " ".join([str(index) for index in indices])
        print(indices)

    if arguments.license:
        print("Licenses:")
        print("")
        for license_text in licenses:
            print(f"- License *{license_text}* samples: {numpy.count_nonzero(samples[:,license_columns[license_text]])}")
        print("")

    if arguments.license_csv:
        print("count,original,indices\n")
        for license_text in licenses:
            license_column = samples[:, license_columns[license_text]]
            indices = " ".join([str(index) for index in numpy.flatnonzero(license_column)])
            # Double any embedded quotes so the quoted field stays valid CSV.
            quoted = str(license_text).replace('"', '""')
            print(f"{numpy.count_nonzero(license_column)},\"{quoted}\",{indices}\n")