Source code for wikirepo.data.query

"""
Query
-----

A function that calls and combines data from Wikidata.

Note: this module exists so that the function can be called as wikirepo.data.query().

Contents
    query
"""

from ast import literal_eval

# import modin.pandas as pd
import pandas as pd
from tqdm.auto import tqdm
from wikirepo import utils
from wikirepo.data import data_utils, lctn_utils, time_utils, wd_utils


def query(
    ents_dict=None,
    locations=None,
    depth=None,
    timespan=None,
    interval=None,
    climate_props=None,
    demographic_props=None,
    economic_props=None,
    electoral_poll_props=None,
    electoral_result_props=None,
    geographic_props=None,
    institutional_props=None,
    political_props=None,
    misc_props=None,
    verbose=True,
):
    """
    Queries Wikidata properties based on module arguments for locations given a depth, interval, and timespan.

    Each ``*_props`` argument that is neither None nor False selects one data
    directory. The directories are queried one after another via
    data_utils.query_repo_dir and the resulting dataframes are merged on the
    location columns (plus the time column when an interval is given).

    Parameters
    ----------
        ents_dict : wd_utils.EntitiesDict : optional (default=None)
            A dictionary with keys being Wikidata QIDs and values being their entities.

            Note: if None, a new wd_utils.EntitiesDict is created. Entities returned
            by the lower level queries are added to this dictionary in place.

        locations : str, list, or lctn_utils.LocationsDict (contains strs) : optional (default=None)
            The locations to query either as strings for indexed locations or Wikidata QIDs.

            Note: a single string is wrapped into a one element list.

        depth : int : optional (default=None)
            The depth from the given lbls or qids that data should go.

            Note 1: this uses 'P150' (contains administrative territorial entity).

            Note 2: if None and locations is a lctn_utils.LocationsDict, then
            locations.get_depth() is used.

        timespan : two element tuple or list : contains datetime.date or tuple (default=None)
            A tuple or list that defines the start and end dates to be queried.

            Note 1: if None, then only the most recent data for the interval will be queried.

            Note 2: if True, then the full timespan from 1-1-1 to the current day will be queried.

            Note 3: passing a single entry will query for that date only.

        interval : str (default=None)
            The time interval over which queries will be made.

            Note 1: see data.time_utils for options.

            Note 2: if None, then only the most recent data will be queried.

        climate_props : str or list (contains strs) : optional (default=None)
            String representations of data/climate modules for data_utils.query_repo_dir.

        demographic_props : str or list (contains strs) : optional (default=None)
            String representations of data/demographic modules for data_utils.query_repo_dir.

        economic_props : str or list (contains strs) : optional (default=None)
            String representations of data/economic modules for data_utils.query_repo_dir.

        electoral_poll_props : str or list (contains strs) : optional (default=None)
            String representations of data/electoral_polls modules for data_utils.query_repo_dir.

        electoral_result_props : str or list (contains strs) : optional (default=None)
            String representations of data/electoral_results modules for data_utils.query_repo_dir.

        geographic_props : str or list (contains strs) : optional (default=None)
            String representations of data/geographic modules for data_utils.query_repo_dir.

        institutional_props : str or list (contains strs) : optional (default=None)
            String representations of data/institutional modules for data_utils.query_repo_dir.

        political_props : str or list (contains strs) : optional (default=None)
            String representations of data/political modules for data_utils.query_repo_dir.

        misc_props : str or list (contains strs) : optional (default=None)
            String representations of data/misc (miscellaneous) modules for data_utils.query_repo_dir.

        Note for all ``*_props`` arguments: passing True selects every index that
        data_utils.incl_dir_idxs lists for the directory.

        verbose : bool or str (default=True)
            Whether to show a tqdm progress bar for the query.

            Note: passing 'full' also enables progress bars for each data_utils.query_repo_dir.

        Potential later arguments:
            multicore : bool or int (default=False)
                Whether to make use of multiple processes and threads, and how many to use
                Note: True uses all available.

            source : bool (default=False)
                Whether to add columns for sources for all data.

    Returns
    -------
        df_merge : pd.DataFrame
            A df of locations and data given timespan and data source arguments.

    Raises
    ------
        ValueError
            If none of the ``*_props`` arguments selects any data to query.
    """
    # Arguments that select data directories, in the order they are queried.
    # All other arguments are baseline arguments that are passed directly.
    prop_args = {
        "climate_props": climate_props,
        "demographic_props": demographic_props,
        "economic_props": economic_props,
        "electoral_poll_props": electoral_poll_props,
        "electoral_result_props": electoral_result_props,
        "geographic_props": geographic_props,
        "institutional_props": institutional_props,
        "political_props": political_props,
        "misc_props": misc_props,
    }

    # None and False both mean that the directory should not be queried.
    query_args = [
        arg
        for arg, value in prop_args.items()
        if value is not None and value is not False
    ]
    if not query_args:
        # Without this check there is nothing to merge and the column handling
        # at the end of the function would fail on a None dataframe.
        raise ValueError(
            "No properties to query: at least one '*_props' argument must be given."
        )

    # Only depth is inherited from a LocationsDict; interval and timespan
    # must be passed explicitly.
    if isinstance(locations, lctn_utils.LocationsDict):
        if depth is None:
            depth = locations.get_depth()

    if ents_dict is None:
        ents_dict = wd_utils.EntitiesDict()

    if isinstance(locations, str):
        locations = [locations]

    # The following is necessary for passing tuples with datetime.date
    # objects to literal_eval.
    # Convert to a tuple of tuples, and then back again in the lower
    # functions via time_utils.make_timespan() in data_utils.gen_base_df().
    # The result does not change between directories, so it is done once.
    timespan = literal_eval(f"{timespan}".replace("datetime.date", ""))

    # Lower level progress bars are only shown when verbose is 'full'.
    lower_level_verbose = verbose == "full"

    df_merge = None
    for arg in tqdm(
        query_args, desc="Directories queried", unit="dir", disable=not verbose
    ):
        # The directory name is the argument name without '_props', with the
        # two electoral directories being plural.
        sub_directory = arg[: -len("_props")]
        if sub_directory in ("electoral_poll", "electoral_result"):
            sub_directory += "s"

        # A new parameter dictionary per directory, so that no property flags
        # from a previous directory are carried over.
        query_params = {
            # Taken inside the loop as ents_dict grows with each directory.
            "ents_dict": literal_eval(str(ents_dict._print())),
            "dir_name": sub_directory,
            "locations": (
                literal_eval(str(locations._print()))
                if isinstance(locations, lctn_utils.LocationsDict)
                else locations
            ),
            "depth": depth,
            "timespan": timespan,
            "interval": interval,
            "verbose": lower_level_verbose,
        }

        # Included indexes for the given data type.
        incl_indexes = data_utils.incl_dir_idxs(dir_name=sub_directory)

        # Assigning True for the specific data indexes to be queried, which is passed to data_utils.query_repo_dir.
        query_arg_indexes = prop_args[arg]
        if query_arg_indexes is True:
            for idx in incl_indexes:
                query_params[idx] = True

        else:
            if isinstance(query_arg_indexes, str):
                query_arg_indexes = [query_arg_indexes]
            for idx in query_arg_indexes:
                if idx in incl_indexes:
                    query_params[idx] = True
                else:
                    # Not a valid index for this directory; presumably this
                    # reports the invalid name - see utils.check_str_args.
                    utils.check_str_args(arguments=idx, valid_args=incl_indexes)

        # Pass the created dictionary as kwargs for data_utils.query_repo_dir.
        # The str/literal_eval round trip hands the lower function its own
        # copy built from plain literals.
        df_dir_props, new_ents_dict = data_utils.query_repo_dir(
            **literal_eval(str(query_params))
        )

        if df_merge is None:
            df_merge = df_dir_props
        else:
            # geo cols are queried as a list, and time as a string.
            merge_on = lctn_utils.depth_to_cols(depth=depth)
            if interval is not None:
                merge_on = merge_on + [
                    time_utils.interval_to_col_name(interval=interval)
                ]

            df_merge = pd.merge(df_merge, df_dir_props, on=merge_on)

        # Keep entities that were newly found so that later directories (and
        # the caller, if ents_dict was passed in) can reuse them.
        for k in new_ents_dict.keys():
            if k not in ents_dict.keys():
                ents_dict[k] = new_ents_dict[k]

    # Reduce QID columns to just the first one, which keeps its position.
    qid_cols = [col for col in df_merge.columns if col.startswith("qid")]
    if qid_cols:
        # Protect the first QID column under a temporary name while the
        # others are removed.
        df_merge = df_merge.rename(columns={qid_cols[0]: "keep_this_col"})
        # qid columns can be named the same.
        df_merge = df_merge.loc[:, ~df_merge.columns.duplicated()]
        other_qid_cols = [col for col in df_merge.columns if col.startswith("qid")]
        df_merge = df_merge.drop(columns=other_qid_cols)
        df_merge = df_merge.rename(columns={"keep_this_col": "qid"})

    return df_merge