Table of Contents

Mongo Collection Archeology

This page gives some insight into the structure of some of the central Mongo collections.

python code snippet

There is room for improvement, but here is the code snippet I used to produce the results below.

from pymongo import MongoClient

def get_types(doc, parent=""):
    """Collect ``(key_path, type)`` pairs for the values nested inside *doc*.

    The structure is walked recursively:

    * For a dict, every key yields one ``(key_path, type(value))`` pair, where
      ``key_path`` is ``"<parent>.<key>"`` (or just the key when *parent* is
      empty). Non-empty dict and list values are then walked as well.
    * For a list, every element is walked with the path ``"<parent>.list_of"``.
      Elements that are themselves dicts or lists contribute only their
      contents; no pair is emitted for the element itself.
    * Any other value yields the single pair ``(parent, type(doc))``.

    Args:
        doc: The value to inspect (typically a document returned by pymongo).
        parent: Key path of *doc* inside the enclosing document; empty for the
            top-level document.

    Returns:
        A list of ``(key_path, type)`` tuples in traversal order. Paths are
        repeated once per occurrence (e.g. once per list element).
    """
    if isinstance(doc, dict):
        result = []
        for key, value in doc.items():
            key_path = f"{parent}.{key}" if parent else key
            result.append((key_path, type(value)))
            # Use isinstance rather than exact type equality so that dict and
            # list subclasses are descended into, matching the isinstance
            # dispatch applied to ``doc`` itself. Empty containers are skipped
            # because they contribute nothing.
            if isinstance(value, (dict, list)) and value:
                result += get_types(value, key_path)
        return result

    if isinstance(doc, list):
        # The element path is the same for every item of this list.
        key_path = f"{parent}.list_of"
        result = []
        for item in doc:
            result += get_types(item, key_path)
        return result

    return [(parent, type(doc))]


def print_results(types, counts):
    """Print one line per key path, ordered by ascending occurrence count.

    Args:
        types: Mapping of key path to the set of types seen for that path.
        counts: Mapping of key path to its number of occurrences; must hold
            an entry for every key in *types*.

    Each printed line has the form ``<key_path> <set_of_types> <count>``.
    """
    # sorted() is stable, so paths with equal counts keep the iteration order
    # of ``types``.
    ordered_paths = sorted(types, key=lambda path: counts[path])
    for path in ordered_paths:
        print(path, types[path], counts[path])


def main():
    """Scan ``meta_db.meta_db_collection`` and report the key paths it contains.

    For every document returned by ``col.find()`` the key paths and value
    types are collected via ``get_types``. Two mappings are accumulated:

    * ``types``: key path -> set of all types observed for that path
    * ``counts``: key path -> number of times the path occurred

    Intermediate results are printed every 100000 documents and the final
    result is printed once the scan is complete.
    """
    types = {}
    counts = {}

    # Use the client as a context manager so its connections are closed when
    # the scan finishes or fails part-way through.
    with MongoClient("mongodb://mongo_uri") as mclient:
        col = mclient.meta_db.meta_db_collection

        for count, doc in enumerate(col.find()):
            for key, vtype in get_types(doc):
                types.setdefault(key, set()).add(vtype)
                counts[key] = counts.get(key, 0) + 1

            # Progress report; note this also fires for the very first
            # document, since 0 % 100000 == 0.
            if count % 100000 == 0:
                print("-"*100)
                print(count)
                print_results(types, counts)

    print_results(types, counts)


# Run the collection scan only when this snippet is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Result on meta_db_collection

metadata.sort_key {<class 'int'>} 37218
metadata.x_model_number {<class 'str'>} 37218
metadata.x_promotion_nr {<class 'str'>} 37218
metadata.material {<class 'str'>} 56828
metadata.extraImgUrl {<class 'str'>} 57081
metadata.shop {<class 'str'>} 60991
metadata.season {<class 'str'>} 69154
metadata.material_feed {<class 'str'>} 94635
metadata.ean {<class 'str'>} 116994
metadata.strike_price {<class 'str'>} 133602
metadata.available {<class 'int'>} 133636
metadata.avail_code {<class 'str'>} 137305
metadata.sizes {<class 'str'>} 137305
metadata.stock_amount_color {<class 'str'>} 137305
metadata.available_sizes {<class 'str'>} 212528
metadata.max_price {<class 'int'>} 212528
metadata.min_price {<class 'int'>} 212528
metadata.x_sizes {<class 'str'>, <class 'list'>} 212528
metadata.size {<class 'list'>} 354791
metadata.size.list_of {<class 'str'>} 357933
metadata.country_code {<class 'str'>} 440848
metadata.is_sale {<class 'bool'>} 514038
metadata.x_sizes.list_of.size {<class 'str'>} 775221
metadata.x_sizes.list_of.price {<class 'int'>} 775221
metadata.x_sizes.list_of.available {<class 'int'>} 775221
metadata.old_price {<class 'int'>, <class 'bson.int64.Int64'>} 7275431
metadata.color {<class 'str'>} 10025321
metadata.position {<class 'list'>} 10911335
metadata.short_tease {<class 'str'>} 11284945
metadata.gender {<class 'str'>} 16577742
created {<class 'int'>} 17680385
metadata.deeplink {<class 'str'>} 17833292
metadata.brand {<class 'str'>} 19029590
metadata.name {<class 'str'>} 19127343
_id {<class 'bson.objectid.ObjectId'>} 21106844
picalike_id {<class 'str'>} 21106844
last_visit {<class 'int'>} 21106844
metadata {<class 'dict'>} 21106844
metadata.category {<class 'list'>} 21106844
metadata.images {<class 'list'>} 21106844
metadata.price {<class 'int'>, <class 'bson.int64.Int64'>} 21106844
metadata.prod_id {<class 'str'>} 21106844
prod_id {<class 'str'>} 21106844
shop_id {<class 'str'>} 21106844
metadata.position.list_of {<class 'float'>} 45722368
metadata.images.list_of {<class 'str'>} 58947032
metadata.category.list_of.list_of {<class 'str'>} 200744457

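Format of the result is: “key_path” “set_of_types” “number_of_occurrences”. Lines are sorted by ascending number of occurrences. A “list_of” path segment stands for the elements of a list; every element is counted separately, so such paths can occur more often than there are documents.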

:!: This is missing an explanation of what those fields are.