Source code for wikirepo.data.lctn_utils

"""
Location Utilities
------------------

Utility functions for querying locations.

Contents
    lctn_to_qid_dict,
    qid_to_lctn_dict,
    incl_lctn_lbls,
    incl_lctn_ids,
    lctn_lbl_to_qid,
    qid_tp_lctn_lbl,
    depth_to_col_name,
    depth_to_cols,
    depth_to_qid_col_name,
    depth_to_qid_cols,
    find_qid_get_depth,
    get_qids_at_depth,
    iter_set_dict,
    gen_lctns_dict,
    derive_depth,
    merge_lctn_dicts,
    find_key_items

    LocationsDict Class
        __init__,
        __repr__,
        __str__,
        key_lbls_list,
        get_depth,
        iter_key_items,
        iter_set,
        get_qids_at_depth,
        _print
"""

from pandas.core.common import flatten
from tqdm.auto import tqdm
from wikirepo import utils
from wikirepo.data import wd_utils


[docs]def lctn_to_qid_dict(): """ Queries a dictionary that links a location's name to its WikiData QID. """ return { "Earth": "Q2", "Africa": "Q15", "Antarctica": "Q51", "Asia": "Q48", "Europe": "Q46", "North America": "Q49", "South America": "Q18", "Oceania": "Q538", "Abkhazia": "Q23334", "Afghanistan": "Q889", "Albania": "Q222", "Algeria": "Q262", "American Samoa": "Q16641", "Andorra": "Q228", "Angola": "Q916", "Anguilla": "Q25228", "Antigua and Barbuda": "Q781", "Argentina": "Q414", "Armenia": "Q399", "Aruba": "Q21203", "Australia": "Q408", "Austria": "Q40", "Azerbaijan": "Q227", "Åland Islands": "Q5689", "Azores": "Q25263", "Bahrain": "Q398", "Bangladesh": "Q902", "Barbados": "Q244", "Belarus": "Q184", "Belgium": "Q31", "Belize": "Q242", "Benin": "Q962", "Bermuda": "Q23635", "Bhutan": "Q917", "Bolivia": "Q750", "Bonaire": "Q25396", "Bosnia and Herzegovina": "Q225", "Botswana": "Q963", "Brazil": "Q155", "British Virgin Islands": "Q25305", "Brunei": "Q921", "Bulgaria": "Q219", "Burkina Faso": "Q965", "Burundi": "Q967", "Cambodia": "Q424", "Cameroon": "Q1009", "Canada": "Q16", "Cape Verde": "Q1011", "Cayman Islands": "Q5785", "Central African Republic": "Q929", "Chad": "Q657", "Chile": "Q298", "Christmas Island": "Q31063", "Cocos (Keeling) Islands": "Q36004", "Colombia": "Q739", "Comoros": "Q970", "Cook Islands": "Q26988", "Costa Rica": "Q800", "Croatia": "Q224", "Cuba": "Q241", "Curaçao": "Q25279", "Cyprus": "Q229", "Czech Republic": "Q213", "Democratic Republic of the Congo": "Q974", "Denmark": "Q35", "Djibouti": "Q977", "Dominica": "Q784", "Dominican Republic": "Q786", "East Timor": "Q574", "East Turkestan": "Q840601", "Ecuador": "Q736", "Egypt": "Q79", "El Salvador": "Q792", "Equatorial Guinea": "Q983", "Eritrea": "Q986", "Estonia": "Q191", "Eswatini": "Q1050", "Ethiopia": "Q115", "Falkland Islands": "Q9648", "Faroe Islands": "Q4628", "Federated States of Micronesia": "Q702", "Fiji": "Q712", "Finland": "Q33", "France": "Q142", "French Polynesia": "Q30971", "Gabon": "Q1000", "Georgia": "Q230", "Germany": "Q183", "Ghana": "Q117", "Gibraltar": "Q1410", "Greece": "Q41", "Greenland": "Q223", "Grenada": "Q769", "Guadeloupe": "Q3118683", "Guam": "Q16635", "Guatemala": "Q774", "Guernsey": "Q25230", "Guinea": "Q1006", "Guinea-Bissau": "Q1007", "Guyana": "Q734", "Haiti": "Q790", "Honduras": "Q783", "Hong Kong": "Q8646", "Hungary": "Q28", "Iceland": "Q189", "India": "Q668", "Indonesia": "Q252", "Iran": "Q794", "Iraq": "Q796", "Ireland": "Q27", "Isle of Man": "Q9676", "Israel": "Q801", "Italy": "Q38", "Ivory Coast": "Q1008", "Jamaica": "Q766", "Japan": "Q17", "Jersey": "Q785", "Jordan": "Q810", "Kazakhstan": "Q232", "Kenya": "Q114", "Kiribati": "Q710", "Kosovo": "Q1246", "Kuwait": "Q817", "Kyrgyzstan": "Q813", "Laos": "Q819", "Latvia": "Q211", "Lebanon": "Q822", "Lesotho": "Q1013", "Liberia": "Q1014", "Libya": "Q1016", "Liechtenstein": "Q347", "Lithuania": "Q37", "Luxembourg": "Q32", "Madagascar": "Q1019", "Madeira": "Q26253", "Malawi": "Q1020", "Malaysia": "Q833", "Maldives": "Q826", "Mali": "Q912", "Malta": "Q233", "Marshall Islands": "Q709", "Martinique": "Q17054", "Mauritania": "Q1025", "Mauritius": "Q1027", "Mayotte": "Q17063", "Mexico": "Q96", "Moldova": "Q217", "Monaco": "Q235", "Mongolia": "Q711", "Montenegro": "Q236", "Montserrat": "Q13353", "Morocco": "Q1028", "Mozambique": "Q1029", "Myanmar": "Q836", "Namibia": "Q1030", "Nauru": "Q697", "Navassa Island": "Q25359", "Nepal": "Q837", "Netherlands": "Q55", "New Caledonia": "Q33788", "New Zealand": "Q664", "Nicaragua": "Q811", "Niger": "Q1032", "Nigeria": "Q1033", "Niue": "Q34020", "Norfolk Island": "Q31057", "North Korea": "Q423", "North Macedonia": "Q221", "Turkish Republic of Northern Cyprus": "Q23681", "Northern Mariana Islands": "Q16644", "Norway": "Q20", "Oman": "Q842", "Pakistan": "Q843", "Palau": "Q695", "State of Palestine": "Q219060", "Panama": "Q804", "Papua New Guinea": "Q691", "Paraguay": "Q733", "People's Republic of China": "Q148", "Peru": "Q419", "Philippines": "Q928", "Pitcairn Islands": "Q35672", "Poland": "Q36", "Portugal": "Q45", "Puerto Rico": "Q1183", "Qatar": "Q846", "Republic of Artsakh": "Q244165", "Republic of the Congo": "Q971", "Romania": "Q218", "Russia": "Q159", "Rwanda": "Q1037", "Saba": "Q25528", "Saint Barthélemy": "Q25362", "Saint Helena, Ascension and Tristan da Cunha": "Q192184", "Saint Kitts and Nevis": "Q763", "Saint Lucia": "Q760", "Saint Martin": "Q25596", "Saint Pierre and Miquelon": "Q34617", "Saint Vincent and the Grenadines": "Q757", "Samoa": "Q683", "San Marino": "Q238", "Saudi Arabia": "Q851", "Senegal": "Q1041", "Serbia": "Q403", "Seychelles": "Q1042", "Sierra Leone": "Q1044", "Singapore": "Q334", "Sint Eustatius": "Q26180", "Sint Maarten": "Q26273", "Slovakia": "Q214", "Slovenia": "Q215", "Solomon Islands": "Q685", "Somalia": "Q1045", "South Africa": "Q258", "South Georgia and the South Sandwich Islands": "Q35086", "South Korea": "Q884", "South Ossetia": "Q23427", "South Sudan": "Q958", "Spain": "Q29", "Sri Lanka": "Q854", "Sudan": "Q1049", "Suriname": "Q730", "Sweden": "Q34", "Switzerland": "Q39", "Syria": "Q858", "São Tomé and Príncipe": "Q1039", "Taiwan": "Q865", "Tajikistan": "Q863", "Tanzania": "Q924", "Thailand": "Q869", "The Bahamas": "Q778", "The Gambia": "Q1005", "Tibet": "Q17252", "Togo": "Q945", "Tokelau": "Q36823", "Tonga": "Q678", "Transnistria": "Q907112", "Trinidad and Tobago": "Q754", "Tunisia": "Q948", "Turkey": "Q43", "Turkmenistan": "Q874", "Turks and Caicos Islands": "Q18221", "Tuvalu": "Q672", "United States Virgin Islands": "Q11703", "Uganda": "Q1036", "Ukraine": "Q212", "United Arab Emirates": "Q878", "United Kingdom": "Q145", "United States of America": "Q30", "Uruguay": "Q77", "Uzbekistan": "Q265", "Vanuatu": "Q686", "Vatican City": "Q237", "Venezuela": "Q717", "Vietnam": "Q881", "Wallis and Futuna": "Q35555", "Western Sahara": "Q6250", "Yemen": "Q805", "Zambia": "Q953", "Zimbabwe": "Q954", }
[docs]def qid_to_lctn_dict(): """ Queries a dictionary that links a location's name to its WikiData QID. Notes ----- Keys are QIDs, and values are dictionaries of QID labels and their locational level. This could also be rewritten using Q3624078: sovereign state. """ return { "Q2": {"lbl": "Earth", "lctn_lvl": "world"}, "Q15": {"lbl": "Africa", "lctn_lvl": "continent"}, "Q51": {"lbl": "Antarctica", "lctn_lvl": "continent"}, "Q48": {"lbl": "Asia", "lctn_lvl": "continent"}, "Q46": {"lbl": "Europe", "lctn_lvl": "continent"}, "Q49": {"lbl": "North America", "lctn_lvl": "continent"}, "Q18": {"lbl": "South America", "lctn_lvl": "continent"}, "Q538": {"lbl": "Oceania", "lctn_lvl": "continent"}, "Q23334": {"lbl": "Abkhazia", "lctn_lvl": "region"}, "Q889": {"lbl": "Afghanistan", "lctn_lvl": "country"}, "Q222": {"lbl": "Albania", "lctn_lvl": "country"}, "Q262": {"lbl": "Algeria", "lctn_lvl": "country"}, "Q16641": {"lbl": "American Samoa", "lctn_lvl": "region"}, "Q228": {"lbl": "Andorra", "lctn_lvl": "country"}, "Q916": {"lbl": "Angola", "lctn_lvl": "country"}, "Q25228": {"lbl": "Anguilla", "lctn_lvl": "region"}, "Q781": {"lbl": "Antigua and Barbuda", "lctn_lvl": "country"}, "Q414": {"lbl": "Argentina", "lctn_lvl": "country"}, "Q399": {"lbl": "Armenia", "lctn_lvl": "country"}, "Q21203": {"lbl": "Aruba", "lctn_lvl": "region"}, "Q408": {"lbl": "Australia", "lctn_lvl": "country"}, "Q40": {"lbl": "Austria", "lctn_lvl": "country"}, "Q227": {"lbl": "Azerbaijan", "lctn_lvl": "country"}, "Q5689": {"lbl": "Åland Islands", "lctn_lvl": "region"}, "Q25263": {"lbl": "Azores", "lctn_lvl": "region"}, "Q398": {"lbl": "Bahrain", "lctn_lvl": "country"}, "Q902": {"lbl": "Bangladesh", "lctn_lvl": "country"}, "Q244": {"lbl": "Barbados", "lctn_lvl": "country"}, "Q184": {"lbl": "Belarus", "lctn_lvl": "country"}, "Q31": {"lbl": "Belgium", "lctn_lvl": "country"}, "Q242": {"lbl": "Belize", "lctn_lvl": "country"}, "Q962": {"lbl": "Benin", "lctn_lvl": "country"}, "Q23635": {"lbl": "Bermuda", "lctn_lvl": "region"}, "Q917": {"lbl": "Bhutan", "lctn_lvl": "country"}, "Q750": {"lbl": "Bolivia", "lctn_lvl": "country"}, "Q25396": {"lbl": "Bonaire", "lctn_lvl": "region"}, "Q225": {"lbl": "Bosnia and Herzegovina", "lctn_lvl": "country"}, "Q963": {"lbl": "Botswana", "lctn_lvl": "country"}, "Q155": {"lbl": "Brazil", "lctn_lvl": "country"}, "Q25305": {"lbl": "British Virgin Islands", "lctn_lvl": "region"}, "Q921": {"lbl": "Brunei", "lctn_lvl": "country"}, "Q219": {"lbl": "Bulgaria", "lctn_lvl": "country"}, "Q965": {"lbl": "Burkina Faso", "lctn_lvl": "country"}, "Q967": {"lbl": "Burundi", "lctn_lvl": "country"}, "Q424": {"lbl": "Cambodia", "lctn_lvl": "country"}, "Q1009": {"lbl": "Cameroon", "lctn_lvl": "country"}, "Q16": {"lbl": "Canada", "lctn_lvl": "country"}, "Q1011": {"lbl": "Cape Verde", "lctn_lvl": "country"}, "Q5785": {"lbl": "Cayman Islands", "lctn_lvl": "region"}, "Q929": {"lbl": "Central African Republic", "lctn_lvl": "country"}, "Q657": {"lbl": "Chad", "lctn_lvl": "country"}, "Q298": {"lbl": "Chile", "lctn_lvl": "country"}, "Q148": {"lbl": "People's Republic of China", "lctn_lvl": "country"}, "Q31063": {"lbl": "Christmas Island", "lctn_lvl": "region"}, "Q36004": {"lbl": "Cocos (Keeling) Islands", "lctn_lvl": "region"}, "Q739": {"lbl": "Colombia", "lctn_lvl": "country"}, "Q970": {"lbl": "Comoros", "lctn_lvl": "country"}, "Q26988": {"lbl": "Cook Islands", "lctn_lvl": "region"}, "Q800": {"lbl": "Costa Rica", "lctn_lvl": "country"}, "Q224": {"lbl": "Croatia", "lctn_lvl": "country"}, "Q241": {"lbl": "Cuba", "lctn_lvl": "country"}, "Q25279": {"lbl": "Curaçao", "lctn_lvl": "region"}, "Q229": {"lbl": "Cyprus", "lctn_lvl": "country"}, "Q213": {"lbl": "Czech Republic", "lctn_lvl": "country"}, "Q974": {"lbl": "Democratic Republic of the Congo", "lctn_lvl": "country",}, "Q35": {"lbl": "Denmark", "lctn_lvl": "country"}, "Q977": {"lbl": "Djibouti", "lctn_lvl": "country"}, "Q784": {"lbl": "Dominica", "lctn_lvl": "country"}, "Q786": {"lbl": "Dominican Republic", "lctn_lvl": "country"}, "Q574": {"lbl": "East Timor", "lctn_lvl": "country"}, "Q840601": {"lbl": "East Turkestan", "lctn_lvl": "region"}, "Q736": {"lbl": "Ecuador", "lctn_lvl": "country"}, "Q79": {"lbl": "Egypt", "lctn_lvl": "country"}, "Q792": {"lbl": "El Salvador", "lctn_lvl": "country"}, "Q983": {"lbl": "Equatorial Guinea", "lctn_lvl": "country"}, "Q986": {"lbl": "Eritrea", "lctn_lvl": "country"}, "Q191": {"lbl": "Estonia", "lctn_lvl": "country"}, "Q1050": {"lbl": "Eswatini", "lctn_lvl": "country"}, "Q115": {"lbl": "Ethiopia", "lctn_lvl": "country"}, "Q9648": {"lbl": "Falkland Islands", "lctn_lvl": "region"}, "Q4628": {"lbl": "Faroe Islands", "lctn_lvl": "region"}, "Q702": {"lbl": "Federated States of Micronesia", "lctn_lvl": "country",}, "Q712": {"lbl": "Fiji", "lctn_lvl": "country"}, "Q33": {"lbl": "Finland", "lctn_lvl": "country"}, "Q142": {"lbl": "France", "lctn_lvl": "country"}, "Q30971": {"lbl": "French Polynesia", "lctn_lvl": "region"}, "Q1000": {"lbl": "Gabon", "lctn_lvl": "country"}, "Q230": {"lbl": "Georgia", "lctn_lvl": "country"}, "Q183": {"lbl": "Germany", "lctn_lvl": "country"}, "Q117": {"lbl": "Ghana", "lctn_lvl": "country"}, "Q1410": {"lbl": "Gibraltar", "lctn_lvl": "region"}, "Q41": {"lbl": "Greece", "lctn_lvl": "country"}, "Q223": {"lbl": "Greenland", "lctn_lvl": "region"}, "Q769": {"lbl": "Grenada", "lctn_lvl": "country"}, "Q3118683": {"lbl": "Guadeloupe", "lctn_lvl": "region"}, "Q16635": {"lbl": "Guam", "lctn_lvl": "region"}, "Q774": {"lbl": "Guatemala", "lctn_lvl": "country"}, "Q25230": {"lbl": "Guernsey", "lctn_lvl": "region"}, "Q1006": {"lbl": "Guinea", "lctn_lvl": "country"}, "Q1007": {"lbl": "Guinea-Bissau", "lctn_lvl": "country"}, "Q734": {"lbl": "Guyana", "lctn_lvl": "country"}, "Q790": {"lbl": "Haiti", "lctn_lvl": "country"}, "Q783": {"lbl": "Honduras", "lctn_lvl": "country"}, "Q8646": {"lbl": "Hong Kong", "lctn_lvl": "region"}, "Q28": {"lbl": "Hungary", "lctn_lvl": "country"}, "Q189": {"lbl": "Iceland", "lctn_lvl": "country"}, "Q668": {"lbl": "India", "lctn_lvl": "country"}, "Q252": {"lbl": "Indonesia", "lctn_lvl": "country"}, "Q794": {"lbl": "Iran", "lctn_lvl": "country"}, "Q796": {"lbl": "Iraq", "lctn_lvl": "country"}, "Q27": {"lbl": "Ireland", "lctn_lvl": "country"}, "Q9676": {"lbl": "Isle of Man", "lctn_lvl": "region"}, "Q801": {"lbl": "Israel", "lctn_lvl": "country"}, "Q38": {"lbl": "Italy", "lctn_lvl": "country"}, "Q1008": {"lbl": "Ivory Coast", "lctn_lvl": "country"}, "Q766": {"lbl": "Jamaica", "lctn_lvl": "country"}, "Q17": {"lbl": "Japan", "lctn_lvl": "country"}, "Q785": {"lbl": "Jersey", "lctn_lvl": "region"}, "Q810": {"lbl": "Jordan", "lctn_lvl": "country"}, "Q232": {"lbl": "Kazakhstan", "lctn_lvl": "country"}, "Q114": {"lbl": "Kenya", "lctn_lvl": "country"}, "Q710": {"lbl": "Kiribati", "lctn_lvl": "country"}, "Q1246": {"lbl": "Kosovo", "lctn_lvl": "country"}, "Q817": {"lbl": "Kuwait", "lctn_lvl": "country"}, "Q813": {"lbl": "Kyrgyzstan", "lctn_lvl": "country"}, "Q819": {"lbl": "Laos", "lctn_lvl": "country"}, "Q211": {"lbl": "Latvia", "lctn_lvl": "country"}, "Q822": {"lbl": "Lebanon", "lctn_lvl": "country"}, "Q1013": {"lbl": "Lesotho", "lctn_lvl": "country"}, "Q1014": {"lbl": "Liberia", "lctn_lvl": "country"}, "Q1016": {"lbl": "Libya", "lctn_lvl": "country"}, "Q347": {"lbl": "Liechtenstein", "lctn_lvl": "country"}, "Q37": {"lbl": "Lithuania", "lctn_lvl": "country"}, "Q32": {"lbl": "Luxembourg", "lctn_lvl": "country"}, "Q1019": {"lbl": "Madagascar", "lctn_lvl": "country"}, "Q26253": {"lbl": "Madeira", "lctn_lvl": "region"}, "Q1020": {"lbl": "Malawi", "lctn_lvl": "country"}, "Q833": {"lbl": "Malaysia", "lctn_lvl": "country"}, "Q826": {"lbl": "Maldives", "lctn_lvl": "country"}, "Q912": {"lbl": "Mali", "lctn_lvl": "country"}, "Q233": {"lbl": "Malta", "lctn_lvl": "country"}, "Q709": {"lbl": "Marshall Islands", "lctn_lvl": "country"}, "Q17054": {"lbl": "Martinique", "lctn_lvl": "region"}, "Q1025": {"lbl": "Mauritania", "lctn_lvl": "country"}, "Q1027": {"lbl": "Mauritius", "lctn_lvl": "country"}, "Q17063": {"lbl": "Mayotte", "lctn_lvl": "region"}, "Q96": {"lbl": "Mexico", "lctn_lvl": "country"}, "Q217": {"lbl": "Moldova", "lctn_lvl": "country"}, "Q235": {"lbl": "Monaco", "lctn_lvl": "country"}, "Q711": {"lbl": "Mongolia", "lctn_lvl": "country"}, "Q236": {"lbl": "Montenegro", "lctn_lvl": "country"}, "Q13353": {"lbl": "Montserrat", "lctn_lvl": "region"}, "Q1028": {"lbl": "Morocco", "lctn_lvl": "country"}, "Q1029": {"lbl": "Mozambique", "lctn_lvl": "country"}, "Q836": {"lbl": "Myanmar", "lctn_lvl": "country"}, "Q1030": {"lbl": "Namibia", "lctn_lvl": "country"}, "Q697": {"lbl": "Nauru", "lctn_lvl": "country"}, "Q25359": {"lbl": "Navassa Island", "lctn_lvl": "region"}, "Q837": {"lbl": "Nepal", "lctn_lvl": "country"}, "Q55": {"lbl": "Netherlands", "lctn_lvl": "country"}, "Q33788": {"lbl": "New Caledonia", "lctn_lvl": "region"}, "Q664": {"lbl": "New Zealand", "lctn_lvl": "country"}, "Q811": {"lbl": "Nicaragua", "lctn_lvl": "country"}, "Q1032": {"lbl": "Niger", "lctn_lvl": "country"}, "Q1033": {"lbl": "Nigeria", "lctn_lvl": "country"}, "Q34020": {"lbl": "Niue", "lctn_lvl": "country"}, "Q31057": {"lbl": "Norfolk Island", "lctn_lvl": "region"}, "Q423": {"lbl": "North Korea", "lctn_lvl": "country"}, "Q221": {"lbl": "North Macedonia", "lctn_lvl": "country"}, "Q16644": {"lbl": "Northern Mariana Islands", "lctn_lvl": "region"}, "Q20": {"lbl": "Norway", "lctn_lvl": "country"}, "Q842": {"lbl": "Oman", "lctn_lvl": "country"}, "Q843": {"lbl": "Pakistan", "lctn_lvl": "country"}, "Q695": {"lbl": "Palau", "lctn_lvl": "country"}, "Q219060": {"lbl": "State of Palestine", "lctn_lvl": "country"}, "Q804": {"lbl": "Panama", "lctn_lvl": "country"}, "Q691": {"lbl": "Papua New Guinea", "lctn_lvl": "country"}, "Q733": {"lbl": "Paraguay", "lctn_lvl": "country"}, "Q419": {"lbl": "Peru", "lctn_lvl": "country"}, "Q928": {"lbl": "Philippines", "lctn_lvl": "country"}, "Q35672": {"lbl": "Pitcairn Islands", "lctn_lvl": "region"}, "Q36": {"lbl": "Poland", "lctn_lvl": "country"}, "Q45": {"lbl": "Portugal", "lctn_lvl": "country"}, "Q1183": {"lbl": "Puerto Rico", "lctn_lvl": "region"}, "Q846": {"lbl": "Qatar", "lctn_lvl": "country"}, "Q244165": {"lbl": "Republic of Artsakh", "lctn_lvl": "region"}, "Q971": {"lbl": "Republic of the Congo", "lctn_lvl": "country"}, "Q218": {"lbl": "Romania", "lctn_lvl": "country"}, "Q159": {"lbl": "Russia", "lctn_lvl": "country"}, "Q1037": {"lbl": "Rwanda", "lctn_lvl": "country"}, "Q25528": {"lbl": "Saba", "lctn_lvl": "region"}, "Q25362": {"lbl": "Saint Barthélemy", "lctn_lvl": "region"}, "Q192184": { "lbl": "Saint Helena, Ascension and Tristan da Cunha", "lctn_lvl": "region", }, "Q763": {"lbl": "Saint Kitts and Nevis", "lctn_lvl": "country"}, "Q760": {"lbl": "Saint Lucia", "lctn_lvl": "country"}, "Q25596": {"lbl": "Saint Martin", "lctn_lvl": "region"}, "Q34617": {"lbl": "Saint Pierre and Miquelon", "lctn_lvl": "region"}, "Q757": {"lbl": "Saint Vincent and the Grenadines", "lctn_lvl": "country",}, "Q683": {"lbl": "Samoa", "lctn_lvl": "country"}, "Q238": {"lbl": "San Marino", "lctn_lvl": "country"}, "Q851": {"lbl": "Saudi Arabia", "lctn_lvl": "country"}, "Q1041": {"lbl": "Senegal", "lctn_lvl": "country"}, "Q403": {"lbl": "Serbia", "lctn_lvl": "country"}, "Q1042": {"lbl": "Seychelles", "lctn_lvl": "country"}, "Q1044": {"lbl": "Sierra Leone", "lctn_lvl": "country"}, "Q334": {"lbl": "Singapore", "lctn_lvl": "country"}, "Q26180": {"lbl": "Sint Eustatius", "lctn_lvl": "region"}, "Q26273": {"lbl": "Sint Maarten", "lctn_lvl": "region"}, "Q214": {"lbl": "Slovakia", "lctn_lvl": "country"}, "Q215": {"lbl": "Slovenia", "lctn_lvl": "country"}, "Q685": {"lbl": "Solomon Islands", "lctn_lvl": "country"}, "Q1045": {"lbl": "Somalia", "lctn_lvl": "country"}, "Q258": {"lbl": "South Africa", "lctn_lvl": "country"}, "Q35086": { "lbl": "South Georgia and the South Sandwich Islands", "lctn_lvl": "region", }, "Q884": {"lbl": "South Korea", "lctn_lvl": "country"}, "Q23427": {"lbl": "South Ossetia", "lctn_lvl": "region"}, "Q958": {"lbl": "South Sudan", "lctn_lvl": "country"}, "Q29": {"lbl": "Spain", "lctn_lvl": "country"}, "Q854": {"lbl": "Sri Lanka", "lctn_lvl": "country"}, "Q1049": {"lbl": "Sudan", "lctn_lvl": "country"}, "Q730": {"lbl": "Suriname", "lctn_lvl": "country"}, "Q34": {"lbl": "Sweden", "lctn_lvl": "country"}, "Q39": {"lbl": "Switzerland", "lctn_lvl": "country"}, "Q858": {"lbl": "Syria", "lctn_lvl": "country"}, "Q1039": {"lbl": "São Tomé and Príncipe", "lctn_lvl": "country"}, "Q865": {"lbl": "Taiwan", "lctn_lvl": "country"}, "Q863": {"lbl": "Tajikistan", "lctn_lvl": "country"}, "Q924": {"lbl": "Tanzania", "lctn_lvl": "country"}, "Q869": {"lbl": "Thailand", "lctn_lvl": "country"}, "Q778": {"lbl": "The Bahamas", "lctn_lvl": "country"}, "Q1005": {"lbl": "The Gambia", "lctn_lvl": "country"}, "Q17252": {"lbl": "Tibet", "lctn_lvl": "region"}, "Q945": {"lbl": "Togo", "lctn_lvl": "country"}, "Q36823": {"lbl": "Tokelau", "lctn_lvl": "region"}, "Q678": {"lbl": "Tonga", "lctn_lvl": "country"}, "Q907112": {"lbl": "Transnistria", "lctn_lvl": "region"}, "Q754": {"lbl": "Trinidad and Tobago", "lctn_lvl": "country"}, "Q948": {"lbl": "Tunisia", "lctn_lvl": "country"}, "Q43": {"lbl": "Turkey", "lctn_lvl": "country"}, "Q23681": {"lbl": "Turkish Republic of Northern Cyprus", "lctn_lvl": "region",}, "Q874": {"lbl": "Turkmenistan", "lctn_lvl": "country"}, "Q18221": {"lbl": "Turks and Caicos Islands", "lctn_lvl": "region"}, "Q672": {"lbl": "Tuvalu", "lctn_lvl": "country"}, "Q1036": {"lbl": "Uganda", "lctn_lvl": "country"}, "Q212": {"lbl": "Ukraine", "lctn_lvl": "country"}, "Q878": {"lbl": "United Arab Emirates", "lctn_lvl": "country"}, "Q145": {"lbl": "United Kingdom", "lctn_lvl": "country"}, "Q30": {"lbl": "United States", "lctn_lvl": "country"}, "Q11703": {"lbl": "United States Virgin Islands", "lctn_lvl": "region",}, "Q77": {"lbl": "Uruguay", "lctn_lvl": "country"}, "Q265": {"lbl": "Uzbekistan", "lctn_lvl": "country"}, "Q686": {"lbl": "Vanuatu", "lctn_lvl": "country"}, "Q237": {"lbl": "Vatican City", "lctn_lvl": "country"}, "Q717": {"lbl": "Venezuela", "lctn_lvl": "country"}, "Q881": {"lbl": "Vietnam", "lctn_lvl": "country"}, "Q35555": {"lbl": "Wallis and Futuna", "lctn_lvl": "region"}, "Q6250": {"lbl": "Western Sahara", "lctn_lvl": "region"}, "Q805": {"lbl": "Yemen", "lctn_lvl": "country"}, "Q953": {"lbl": "Zambia", "lctn_lvl": "country"}, "Q954": {"lbl": "Zimbabwe", "lctn_lvl": "country"}, }
[docs]def incl_lctn_lbls(lctn_lvls=False): """ Queries the included location labels. Parameters ---------- lctn_lvls : str or list (contains strs) The level(s) of location to be queried. Returns ------- incl_lctns : list (contains strs) The Wikidata labels corresponding to the provided location level(s). """ lctn_lvls = utils._make_var_list(lctn_lvls)[0] valid_args = ["world", "continent", "country", "region"] assert sorted(list(set(valid_args) | set(lctn_lvls))) == sorted(valid_args), ( "Invalid levels were provided to the 'lctn_lvls' argument. Valid options are: " + ", ".join(valid_args) + "." ) incl_lctns = [] if "world" in lctn_lvls: incl_lctns.append( [ lctn for lctn in lctn_to_qid_dict().keys() if qid_to_lctn_dict()[lctn_to_qid_dict()[lctn]]["lctn_lvl"] == "world" ] ) if "continent" in lctn_lvls: incl_lctns.append( [ lctn for lctn in lctn_to_qid_dict().keys() if qid_to_lctn_dict()[lctn_to_qid_dict()[lctn]]["lctn_lvl"] == "continent" ] ) if "country" in lctn_lvls: incl_lctns.append( [ lctn for lctn in lctn_to_qid_dict().keys() if qid_to_lctn_dict()[lctn_to_qid_dict()[lctn]]["lctn_lvl"] == "country" ] ) if "region" in lctn_lvls: incl_lctns.append( [ lctn for lctn in lctn_to_qid_dict().keys() if qid_to_lctn_dict()[lctn_to_qid_dict()[lctn]]["lctn_lvl"] == "region" ] ) return list(flatten(incl_lctns))
[docs]def incl_lctn_ids(): """ Queries the included location ids. """ return list(lctn_to_qid_dict().values())
[docs]def lctn_lbl_to_qid(locations): """ Returns the Wikidata QID for given location(s). Parameters ---------- locations : str or list (contains strs) The label(s) of location(s) to be converted. Returns ------- wd_qids : list (contains strs) The Wikidata QIDs corresponding to the provided location label(s). """ locations, was_str_bool = utils._make_var_list(locations) locations = utils.check_str_args( arguments=locations, valid_args=lctn_to_qid_dict().keys() ) wd_qids = [lctn_to_qid_dict()[lctn] for lctn in locations] return utils._return_given_type(var=wd_qids, var_was_str=was_str_bool)
def qid_to_lctn_lbl(qids): """ Returns the Wikidata label for given QID(s). Parameters ---------- qids : str or list (contains strs) The QID(s) of location(s) to be converted. Returns ------- wd_lbls : list (contains strs) The Wikidata labels corresponding to the provided location qid(s). """ qids, was_str_bool = utils._make_var_list(qids) for q in qids: if q not in qid_to_lctn_dict().keys(): print(f"{q} is not a QID that a label can be directly queried for.") print("Use wd_utils.get_lbl() to load the entity and get its label.") qids.pop(q) wd_lbls = [qid_to_lctn_dict()[q]["lbl"] for q in qids] return utils._return_given_type(var=wd_lbls, var_was_str=was_str_bool)
[docs]def depth_to_col_name(depth): """ Derives the proper name of the column for locations given a depth. """ if depth == 0: return "location" else: return "sub_" * depth + "lctn"
[docs]def depth_to_qid_col_name(depth): """ Derives the proper name of the column for qids for depth based assignment. """ return "sub_" * depth + "qid"
[docs]def depth_to_qid_cols(depth): """ Derives the proper name of the column for qids for depth based assignment. """ return ["sub_" * d + "qid" for d in range(depth + 1)]
def depth_to_cols(depth): """ Derives a list of locational columns for data_utils.gen_base_df given a depth. """ return list( flatten( ["location"] + [["sub_" * d + "lctn"] for d in range(depth + 1) if d > 0] ) )
[docs]def find_qid_get_depth(lctns_dict, c=0): """ Finds all QIDs and gets their depths. """ if isinstance(lctns_dict, dict): for k, v in lctns_dict.items(): if wd_utils.is_wd_id(k): yield (k, int(c / 2)) if isinstance(v, dict): yield from find_qid_get_depth(v, c + 1)
[docs]def get_qids_at_depth(lctns_dict, depth=None): """ Finds all QIDs at a given depth of a LocationsDict. """ all_qid_depths = list(find_qid_get_depth(lctns_dict=lctns_dict, c=0)) return [q[0] for q in all_qid_depths if q[1] == depth]
[docs]def iter_set_dict(dictionary, key, sub_key, value): """ Iterates until a key is found, and then sets the value (potentially given a sub_key). """ for k, v in dictionary.items(): # pylint: disable=unused-variable if k == key: if sub_key is None: dictionary[k] = value else: dictionary[k][sub_key] = value elif isinstance(dictionary[k], dict): iter_set_dict(dictionary[k], key, sub_key, value) elif isinstance(dictionary[k], str): pass
[docs]def gen_lctns_dict( ents_dict=None, locations=None, depth=0, sub_lctns=True, timespan=None, interval=None, # multicore=True, verbose=True, ): """ Generates a dictionary of locations indexed by QIDs. Notes ----- 'P150' (contains administrative territorial entity) is used to subset. Parameters ---------- ents_dict : wd_utils.EntitiesDict : optional (default=None) A dictionary with keys being Wikidata QIDs and values being their entities. locations : str or list (contains strs) : optional (default=None) The name of a location or list of location names. depth : int (default=0, no sub_locations) The depth from the given lbls or qids that data should go. Note: this uses 'P150' (contains administrative territorial entity). sub_lctns : str or list (contains strs) : optional (default=None) sub_locations to subset by or not subset by adding '~' as the first character. timespan : two element tuple or list : contains datetime.date or tuple (default=None: (date.today(), date.today())) A tuple or list that defines the start and end dates to be queried. Note 1: if True, then the full timespan from 1-1-1 to the current day will be queried. Note 2: passing a single entry will query for that date only. interval : str The time interval over which queries will be made. Note 1: see data.time_utils for options. Note 2: if None, then only the most recent data will be queried. verbose : bool (default=True) Whether to show a tqdm progress bar for the creation of the dictionary. Potential later arguments: multicore : bool or int (default=False) Whether to make use of multiple processes and threads, and how many to use. Note: True uses all available cores. Returns ------- subs_dict : dict A dictionary of the given qids as keys and dictionaries of their subsidiaries by 'P150' as items. """ pid = "P150" lctns_dict = LocationsDict() current_depth = 0 def assign_first_iteration( ents_dict=None, lctns_dict=None, qids=None, # multicore=True, verbose=True, ): """ Assigns the first level of a LocationsDict. """ def get_first_iter_dict(ents_dict, lctns_dict, qid): lctns_dict[qid] = {"lbl": wd_utils.get_lbl(ents_dict, qid)} for q in tqdm( qids, desc="Depth 0 derived", total=len(qids), disable=not verbose ): get_first_iter_dict(ents_dict, lctns_dict, q) def assign_another_iteration( ents_dict=ents_dict, lctns_dict=lctns_dict, pid=None, depth=depth, current_depth=current_depth, sub_lctns=sub_lctns, timespan=timespan, interval=interval, # multicore=multicore, verbose=verbose, ): """ Assigns more layers to a LocationsDict after the first. """ sub_lctns = utils._make_var_list(sub_lctns)[0] depth_keys = get_qids_at_depth(lctns_dict=lctns_dict, depth=current_depth) if interval == None: # Assuming that the user wants the current sub-locations. def get_most_frequent_dict(ents_dict, lctns_dict, qid, pid): """ Returns sub-locations that don't have 'P582' (end time) or don't have qualifiers at all. """ if pid in wd_utils.load_ent(ents_dict, qid)["claims"].keys(): subs_info = { sub[0]: {"lbl": sub[1]} for sub in [ [ wd_utils.get_prop_id(ents_dict, qid, pid, i), wd_utils.get_prop_lbl(ents_dict, qid, pid, i), ] for i in range(len(wd_utils.get_prop(ents_dict, qid, pid))) if ( wd_utils.prop_has_qualifiers(ents_dict, qid, pid, i) and "P582" not in wd_utils.get_qualifiers( ents_dict, qid, pid, i ).keys() ) or not wd_utils.prop_has_qualifiers(ents_dict, qid, pid, i) ] if ( sub_lctns == True or (sub_lctns != True and sub[1] in sub_lctns) or ( sub_lctns != True and sub[1] not in [ lctn[1:] for lctn in sub_lctns if lctn[0] == "~" ] ) ) } else: wd_utils.print_not_available( ents_dict=ents_dict, qid=qid, pid=pid, extra_msg=" to derive sub_lctns", ) subs_info = {} iter_set_dict( dictionary=lctns_dict, key=qid, sub_key="sub_lctns", value=subs_info ) for q in tqdm( depth_keys, desc=f"Depth {current_depth + 1} derived", total=len(depth_keys), disable=not verbose, ): get_most_frequent_dict(ents_dict, lctns_dict, q, pid) else: # Find the included times and the timespan for each sub-location element. def get_valid_timespan_dict(ents_dict, lctns_dict, qid, pid): """ Returns the sub-location's id, lbl, and the valid timespan so it can be used in subsetting. """ if pid in wd_utils.load_ent(ents_dict, qid)["claims"].keys(): subs_info = { sub[0]: {"lbl": sub[1], "valid_timespan": sub[2]} for sub in [ [ wd_utils.get_prop_id(ents_dict, qid, pid, i), wd_utils.get_prop_lbl(ents_dict, qid, pid, i), wd_utils.get_prop_timespan_intersection( ents_dict, qid, pid, i, timespan, interval ), ] for i in range(len(wd_utils.get_prop(ents_dict, qid, pid))) ] if ( sub_lctns == True or (sub_lctns != True and sub[1] in sub_lctns) or ( sub_lctns != True and sub[1] not in [ lctn[1:] for lctn in sub_lctns if lctn[0] == "~" ] ) ) and sub[2] != None } else: wd_utils.print_not_available( ents_dict=ents_dict, qid=qid, pid=pid, extra_msg=" to derive sub_lctns", ) subs_info = {} iter_set_dict( dictionary=lctns_dict, key=qid, sub_key="sub_lctns", value=subs_info ) for q in tqdm( depth_keys, desc=f"Depth {current_depth + 1} derived", total=len(depth_keys), disable=not verbose, ): get_valid_timespan_dict(ents_dict, lctns_dict, q, pid) depth -= 1 current_depth += 1 return depth, current_depth locations = utils._make_var_list(locations)[0] qids = [ lctn_lbl_to_qid(lctn) if not wd_utils.is_wd_id(lctn) else lctn for lctn in locations ] if ents_dict == None: ents_dict = wd_utils.EntitiesDict() assign_first_iteration( ents_dict=ents_dict, lctns_dict=lctns_dict, qids=qids, # multicore=multicore, verbose=verbose, ) while depth > 0: depth, current_depth = assign_another_iteration( ents_dict=ents_dict, lctns_dict=lctns_dict, pid=pid, depth=depth, current_depth=current_depth, sub_lctns=sub_lctns, timespan=timespan, interval=interval, # multicore=multicore, verbose=verbose, ) return lctns_dict
[docs]def derive_depth(a_dict, depth=0): """ Derives the depth of a LocationsDict. """ if a_dict != {}: # an empty sub_lctn if "sub_lctns" in a_dict[list(a_dict.keys())[0]].keys(): depth += 1 return derive_depth( a_dict[list(a_dict.keys())[0]]["sub_lctns"], depth=depth ) else: return depth else: return depth
[docs]def merge_lctn_dicts(ld1, ld2): """ Merges two location dictionaries conditionally on them having the same depth. """ assert isinstance(ld1, LocationsDict) and isinstance( ld2, LocationsDict ), "This merge is valid only for LocationsDict objects." depth_1 = derive_depth(ld1) depth_2 = derive_depth(ld2) if depth_1 == depth_2: for k in ld2.keys(): ld1[k] = ld2[k] return ld1 else: ValueError("Two LocationsDict objects of different depth have been passed.")
def iter_key_items(node, kv): """ Finds the items of a nested dictionary key. """ if isinstance(node, list): for i in node: yield from iter_key_items(i, kv) elif isinstance(node, dict): if kv in node: yield node[kv] for j in node.values(): yield from iter_key_items(j, kv)
[docs]class LocationsDict(dict): """ A dictionary for storing WikiData locations. Notes ----- Keywords are QIDs, and values are dictionaries of depth, interval, and timespan specific information. """ __slots__ = () def __init__(self, *args, **kwargs): super(LocationsDict, self).__init__(*args, **kwargs) def __repr__(self): return "%s" % self.__class__ def __str__(self): return """ The LocationsDict class is meant to store information needed to query locations - Keys are QIDs - Values are dictionaries containing: - lbl: the location's label - valid_timespan: the timespan that the location is valid for given a interval and timespan - sub_lctns: equivalent dictionary objects for sub-locations given a depth Note: the last two are optional Because of the potential size, print() has been disabled. All other dictionary methods are included, as well as: key_lbls_list - a list of all location labels get_depth - the depth iter_key_items - the items within a key iter_set - finds and sets a key get_qids_at_depth - finds all QIDs at a given depth key_lbls_at_depth - the key labels at a given depth _print - prints the full LocationsDict """
[docs] def key_lbls_list(self): """ Provides a list of all location labels in the LocationsDict. """ return list(iter_key_items(self, "lbl"))
# def key_lbls_dict(self): # """ # Provides a dict of location labels in the LocationsDict. # """ # return # a dictionary created by a recursive lookup of those elements that have the key 'lbl'
[docs] def get_depth(self): """ The depth of the LocationsDict. """ return derive_depth(self, depth=0)
# def get_interval(self): # """ # The interval of the LocationsDict. # """ # return self['interval'] # def get_timespan(self): # """ # The timespan of the LocationsDict. # """ # return self['timespan'] # def get_lctn_dict_specs(self, depth, timespan, interval): # """ # Combines get_depth, get_interval, and get_timespan of a LocationsDict. # """ # if depth == None: # depth = self.get_depth() # if interval == None: # interval = self.get_interval() # if timespan == None: # timespan = self.get_timespan() # return depth, timespan, interval
[docs] def iter_key_items(self, kv): """ The items within a key in LocationsDict. """ return iter_key_items(self, kv)
[docs] def iter_set(self, key, sub_key, value): """ Finds and sets a key in LocationsDict. """ return iter_set_dict(self, key, sub_key, value)
[docs] def get_qids_at_depth(self, depth=None): """ Finds all QIDs at a given depth of a LocationsDict. """ return get_qids_at_depth(self, depth=depth)
[docs] def key_lbls_at_depth(self, ents_dict, depth): """ Provides a list of key labels at a given depth. """ qids_at_depth = get_qids_at_depth(self, depth=depth) return [wd_utils.get_lbl(ents_dict=ents_dict, pq_id=q) for q in qids_at_depth]
[docs] def _print(self): """ Prints the full LocationsDict. """ return {k: v for k, v in self.items()}