====== Mongo Collection Archeology ======
This page gives some insight into the structure of some of the central Mongo collections.
==== python code snippet ====
There is room for improvement, but here is my code snippet to produce the results below.
from pymongo import MongoClient
def get_types(doc, parent=""):
    """Collect ``(key_path, type)`` pairs for every field in a nested document.

    Args:
        doc: The value to inspect; usually a dict, but lists and scalars are
            accepted so the function can recurse into nested values.
        parent: Dotted path of ``doc`` inside the top-level document, or ""
            when ``doc`` is the top-level document itself.

    Returns:
        A list of ``(key_path, type)`` tuples. Nested dict keys are joined
        with ".", and the contents of a list are reported under
        ``<path>.list_of``. Every list element contributes its own entries,
        so the same path can appear several times for one document.
    """
    if isinstance(doc, dict):
        result = []
        for key, value in doc.items():
            key_path = f"{parent}.{key}" if parent else key
            result.append((key_path, type(value)))
            # Only descend into non-empty containers; an empty dict/list has
            # nothing further to report beyond its own type.
            if isinstance(value, (dict, list)) and value:
                result += get_types(value, key_path)
        return result

    if isinstance(doc, list):
        # The path is the same for every element, so build it once. Guarding
        # on `parent` avoids a stray leading dot for a top-level list.
        item_path = f"{parent}.list_of" if parent else "list_of"
        result = []
        for item in doc:
            result += get_types(item, item_path)
        return result

    # Scalar leaf (reached for list elements): report it under the given path.
    return [(parent, type(doc))]
def print_results(types, counts):
    """Print one line per key path, ordered by ascending occurrence count.

    Args:
        types: Mapping of key path to the set of Python types seen for it.
        counts: Mapping of key path to the number of times it was seen; must
            contain every key present in ``types``.

    Each printed line has the form ``key_path set_of_types count``.
    """
    rows = sorted(
        ((path, seen, counts[path]) for path, seen in types.items()),
        key=lambda row: row[-1],
    )
    for path, seen, total in rows:
        print(path, seen, total)
def main():
    """Scan every document of the collection and report field types and counts.

    For each key path returned by ``get_types`` this records the set of types
    seen and how many times the path occurred. An intermediate report is
    printed whenever the zero-based document index is a multiple of 100000
    (including the very first document), and a final report at the end.
    """
    # Use the client as a context manager so the connection is closed even
    # if the scan is interrupted or raises.
    with MongoClient("mongodb://mongo_uri") as mclient:
        db = mclient.meta_db
        col = db.meta_db_collection
        types = {}
        counts = {}
        for count, doc in enumerate(col.find()):
            for key, vtype in get_types(doc):
                types.setdefault(key, set()).add(vtype)
                counts[key] = counts.get(key, 0) + 1
            if count % 100000 == 0:
                # Periodic progress dump so partial results are visible
                # during a long scan.
                print("-" * 100)
                print(count)
                print_results(types, counts)
        print_results(types, counts)
# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
==== Result on meta_db_collection ====
metadata.sort_key {} 37218
metadata.x_model_number {} 37218
metadata.x_promotion_nr {} 37218
metadata.material {} 56828
metadata.extraImgUrl {} 57081
metadata.shop {} 60991
metadata.season {} 69154
metadata.material_feed {} 94635
metadata.ean {} 116994
metadata.strike_price {} 133602
metadata.available {} 133636
metadata.avail_code {} 137305
metadata.sizes {} 137305
metadata.stock_amount_color {} 137305
metadata.available_sizes {} 212528
metadata.max_price {} 212528
metadata.min_price {} 212528
metadata.x_sizes {, } 212528
metadata.size {} 354791
metadata.size.list_of {} 357933
metadata.country_code {} 440848
metadata.is_sale {} 514038
metadata.x_sizes.list_of.size {} 775221
metadata.x_sizes.list_of.price {} 775221
metadata.x_sizes.list_of.available {} 775221
metadata.old_price {, } 7275431
metadata.color {} 10025321
metadata.position {} 10911335
metadata.short_tease {} 11284945
metadata.gender {} 16577742
created {} 17680385
metadata.deeplink {} 17833292
metadata.brand {} 19029590
metadata.name {} 19127343
_id {} 21106844
picalike_id {} 21106844
last_visit {} 21106844
metadata {} 21106844
metadata.category {} 21106844
metadata.images {} 21106844
metadata.price {, } 21106844
metadata.prod_id {} 21106844
prod_id {} 21106844
shop_id {} 21106844
metadata.position.list_of {} 45722368
metadata.images.list_of {} 58947032
metadata.category.list_of.list_of {} 200744457
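The format of each result line is: “key_path” “set_of_types” “number_of_occurrences”
* key_path: a dot in the key_path indicates a nested dictionary, and a ''%%list_of%%'' segment describes the contents of a list
* set_of_types: the set of Python types seen for that key (note: the type names appear to have been lost when the output was pasted into this page, which is why every set shows up as ''%%{}%%''; a set with a comma contained two different types)
* number_of_occurrences: the number of times the field occurred across all documents. The count for “_id” equals the total number of documents. List elements are counted individually, so ''%%list_of%%'' entries can exceed the number of documents.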
{{/dokuwiki/lib/images/smileys/icon_exclaim.gif|:!:}} This page is still missing an explanation of what these fields mean.