Skip to content

Item quality module

summarize_matches()

Return a simple summary table for item matches regardless of category

Returns a summary table in the following structure

n_unique_matches is the number of uniquely named input values that matched; perc_matched is the percentage of uniquely named input values that matched; n_item_matches is the number of unique items they matched against, including duplicate concepts.

Parameters:

Name Type Description Default
dfs_list list(DataFrame)

A dataframe list of the histology related reconciled tables (cells, tissues and organs)

required
total_list list(int)

A list of the number of original input values.

required

Returns:

Name Type Description
DataFrame

A summary table for the matches.

Source code in wikidata_panglaodb/quality.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def summarize_matches(dfs_list, total_list):
    """Build a per-table overview of how many input values were matched

    One row is produced for each dataframe in ``dfs_list`` (in order), with
    the following columns:
        n_unique_matches is the number of distinct ``input_value`` entries
        n_item_matches is the number of distinct ``id`` entries
        totals is the corresponding entry of ``total_list``
        perc_matched is ``n_unique_matches`` expressed as a percentage of ``totals``

    Args:
        dfs_list (list(DataFrame)): Reconciled tables; each one must have
            ``input_value`` and ``id`` columns.
        total_list (list(int)): Number of original input values for each
            table, in the same order as ``dfs_list``.

    Returns:
        DataFrame: A summary table for the matches.
    """
    matched_inputs = [table["input_value"].nunique() for table in dfs_list]
    matched_items = [table["id"].nunique() for table in dfs_list]

    counts = pd.DataFrame(
        {
            "n_unique_matches": matched_inputs,
            "n_item_matches": matched_items,
            "totals": total_list,
        }
    )

    # Share of the original inputs that found at least one match, in percent.
    return counts.assign(
        perc_matched=counts["n_unique_matches"].div(counts["totals"]).mul(100)
    )

summarize_histology()

Return a summary table for item matches in the histology category

Returns a summary table in the following structure

n_unique_matches is the number of uniquely named input values that matched; perc_matched is the percentage of uniquely named input values that matched; n_item_matches is the number of unique items they matched against, including duplicate concepts; how_many_perfect_matches is the percentage of items matched that got a perfect 100.0 score in the reconciliation; how_many_no_p31 is the percentage of items matched that don't have an 'instance of' property.

Parameters:

Name Type Description Default
dfs_list list(DataFrame)

A dataframe list of the histology related reconciled tables (cells, tissues and organs)

required
total_list list(int)

A list of the number of original input values.

required

Returns:

Name Type Description
DataFrame

A summary table for the matches.

Source code in wikidata_panglaodb/quality.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def summarize_histology(dfs_list, total_list):
    """Extend the generic match summary with histology-specific quality ratios

    Starts from the table produced by ``summarize_matches`` and adds two
    percentage columns, both relative to ``n_item_matches``:
        how_many_perfect_matches is based on the rows whose ``score``
            equals 100.0
        how_many_no_p31 is based on the rows whose ``type`` equals the
            string "[]" (i.e. no 'instance of' value was returned)

    The ``totals`` column and the intermediate counts are removed before
    returning, leaving n_unique_matches, n_item_matches, perc_matched,
    how_many_perfect_matches and how_many_no_p31.

    Args:
        dfs_list (list(DataFrame)): A dataframe list of the histology related
            reconciled tables (cells, tissues and organs); each one must have
            ``score`` and ``type`` columns in addition to those needed by
            ``summarize_matches``.
        total_list (list(int)): A list of the number of original input values.

    Returns:
        DataFrame: A summary table for the matches.
    """
    summary_table = summarize_matches(dfs_list, total_list)

    # Row counts per table; summing a boolean mask counts its True entries.
    perfect_counts = [int((table["score"] == 100.0).sum()) for table in dfs_list]
    untyped_counts = [int((table["type"] == "[]").sum()) for table in dfs_list]

    summary_table["n_perfect_matches"] = perfect_counts
    summary_table["n_no_p31"] = untyped_counts

    item_counts = summary_table["n_item_matches"]
    summary_table["how_many_perfect_matches"] = (
        summary_table["n_perfect_matches"].div(item_counts).mul(100)
    )
    summary_table["how_many_no_p31"] = (
        summary_table["n_no_p31"].div(item_counts).mul(100)
    )

    # Only the percentages are reported; the helper columns are discarded.
    return summary_table.drop(columns=["n_no_p31", "n_perfect_matches", "totals"])

get_number_of_statements_for_items()

Return a pandas dataframe of items and their number of statements

This function takes in a list of QIDs and uses the Wikibase API to return a table with the number of statements each item has.

Parameters:

Name Type Description Default
qid_list list

A list containing the QIDs you want to analyse.

required
has_property str

The property ID (e.g. P31) to look for among each item's claims.

required

Returns:

Name Type Description
DataFrame

A dataframe of three columns: one for the input QIDs (id), one with the number of statements for each QID (n_statements), and one flagging whether the item has the requested property (has_property).

Source code in wikidata_panglaodb/quality.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def get_number_of_statements_for_items(qid_list, has_property):
    """Return a pandas dataframe of items and their number of statements

    This function takes in a list of QIDs and uses the Wikibase API
    (``wbgetentities``) to return a table with the size of each item's
    ``claims`` mapping and whether a given property is present in it.

    Note that ``n_statements`` is the number of keys in the ``claims``
    mapping returned by the API (one key per property used on the item),
    not the total number of individual statement values.

    Args:
        qid_list (list): A list containing the QIDs you want to analyse.
        has_property (str): Property ID (e.g. "P31") to look for among the
            keys of each item's claims.

    Returns:
        DataFrame: A dataframe of three columns: ``id`` with the entity IDs
            returned by the API, ``n_statements`` with the claim count for
            each of them, and ``has_property`` flagging whether
            ``has_property`` is among its claims. Empty (but with the same
            columns) when ``qid_list`` is empty.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        KeyError: If the API response carries no "entities" key.
    """

    # API limits to 50 items at once
    batch_size = 50
    item_quality = {}

    for start in range(0, len(qid_list), batch_size):
        curr_ids = qid_list[start : start + batch_size]

        params = {
            "action": "wbgetentities",
            "ids": "|".join(curr_ids),
            # Only the claims are inspected, so skip labels, sitelinks, etc.
            "props": "claims",
            "format": "json",
        }

        resp = requests.post("https://www.wikidata.org/w/api.php", data=params)
        # Surface HTTP failures directly instead of as a JSON/KeyError later.
        resp.raise_for_status()

        entities = resp.json()["entities"]

        for qid, entity in entities.items():
            # Entities the API could not resolve come back without "claims";
            # report them as having no statements rather than crashing.
            claims = entity.get("claims", {})
            item_quality[qid] = [len(claims), has_property in claims]

    item_quality_table = (
        pd.DataFrame.from_dict(
            item_quality, columns=["n_statements", "has_property"], orient="index"
        )
        .rename_axis("id")
        .reset_index()
    )

    return item_quality_table

get_genes_item_quality()

Changes ID columns to binary values

Changes the alternative id columns in the reconciled gene data to binary values, also drops unnecessary columns if drop == True.

Parameters:

Name Type Description Default
df DataFrame

Reconciled gene data.

required
drop bool

Whether or not to drop the other columns.

True

Returns:

Name Type Description
DataFrame

Simplified dataframe for plotting.

Source code in wikidata_panglaodb/quality.py
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
def get_genes_item_quality(df, drop=True):
    """Changes ID columns to binary values

    Works on a copy of the reconciled gene data: renames the ``item`` column
    to ``id`` and adds two boolean columns, ``has_ensg`` (``ensg_wdt`` is not
    missing) and ``has_entrez`` (``entrez`` is not missing). When ``drop`` is
    truthy the ``itemLabel``, ``entrez``, ``ensg_wdt`` and ``ensg_panglao``
    columns are removed from the result. The input dataframe is not modified.

    Args:
        df (DataFrame): Reconciled gene data.
        drop (bool): Whether or not to drop the unnecessary columns.

    Returns:
        DataFrame: Simplified dataframe for plotting.

    Raises:
        KeyError: If ``ensg_wdt`` or ``entrez`` is missing, or if ``drop`` is
            truthy and any of the columns to drop is missing.
    """

    result_df = df.copy().rename(columns={"item": "id"})

    # A non-missing alternative ID means the item carries that identifier.
    result_df["has_ensg"] = result_df["ensg_wdt"].notna()
    result_df["has_entrez"] = result_df["entrez"].notna()

    if drop:
        result_df = result_df.drop(
            columns=["itemLabel", "entrez", "ensg_wdt", "ensg_panglao"]
        )

    return result_df

aggregate_altID_data()

Summarize alternative IDs for final bar plot

Parameters:

Name Type Description Default
dataframe DataFrame

With either the histological or gene data.

required
group list

Columns to use for the groupby.

required

Returns:

Name Type Description
DataFrame

Summarized counts and percentages.

Source code in wikidata_panglaodb/quality.py
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
def aggregate_altID_data(dataframe, group):
    """Count, per group, how many rows carry the property of interest

    The data is grouped by the columns in ``group``; for every group the
    ``has_property`` column is summed and the first grouping column is
    counted, and the ratio of the two is reported as a percentage.

    Args:
        dataframe (DataFrame): With either the histological or gene data;
            must contain a ``has_property`` column.
        group (list): Columns to use for the groupby.

    Returns:
        DataFrame: The grouping columns followed by ``has_property`` (sum),
            ``total`` (count) and ``percentage_has_id``.
    """
    per_group = dataframe.groupby(group)

    # Named aggregation: output column = (source column, aggregation).
    aggregated = per_group.agg(
        has_property=("has_property", "sum"),
        total=(group[0], "count"),
    ).reset_index()

    aggregated["percentage_has_id"] = (
        aggregated["has_property"].div(aggregated["total"]).mul(100)
    )

    return aggregated