# NOTE: This notebook uses the polars package
import numpy as np
from plotnine import *
import polars as pl
from polars import col
Rank Plot
Comparing a group of ranked items at two different times
Read the data.
Source: World Bank - Infanct Mortality Rate (per 1,000 live births)b
= pl.read_csv(
data "data/API_SP.DYN.IMRT.IN_DS2_en_csv_v2/API_SP.DYN.IMRT.IN_DS2_en_csv_v2.csv",
=4,
skip_rows="",
null_values=True,
truncate_ragged_lines
)
# Columns as valid python variables
= {c: f"y{c}" for c in data.columns if c[:2] in {"19", "20"}}
year_columns = data.rename(
data "Country Name": "country", "Country Code": "code", **year_columns}
{"Indicator Name", "Indicator Code"])
).drop([ data.head()
shape: (5, 60)
country | code | y1960 | y1961 | y1962 | y1963 | y1964 | y1965 | y1966 | y1967 | y1968 | y1969 | y1970 | y1971 | y1972 | y1973 | y1974 | y1975 | y1976 | y1977 | y1978 | y1979 | y1980 | y1981 | y1982 | y1983 | y1984 | y1985 | y1986 | y1987 | y1988 | y1989 | y1990 | y1991 | y1992 | y1993 | y1994 | y1995 | y1996 | y1997 | y1998 | y1999 | y2000 | y2001 | y2002 | y2003 | y2004 | y2005 | y2006 | y2007 | y2008 | y2009 | y2010 | y2011 | y2012 | y2013 | y2014 | y2015 | y2016 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
str | str | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | str | str |
"Aruba" | "ABW" | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null |
"Afghanistan" | "AFG" | null | 240.5 | 236.3 | 232.3 | 228.5 | 224.6 | 220.7 | 217.0 | 213.3 | 209.8 | 206.1 | 202.2 | 198.2 | 194.3 | 190.3 | 186.6 | 182.6 | 178.7 | 174.5 | 170.4 | 166.1 | 161.8 | 157.5 | 153.2 | 148.7 | 144.5 | 140.2 | 135.7 | 131.3 | 126.8 | 122.5 | 118.3 | 114.4 | 110.9 | 107.7 | 105.0 | 102.7 | 100.7 | 98.9 | 97.2 | 95.4 | 93.4 | 91.2 | 89.0 | 86.7 | 84.4 | 82.3 | 80.4 | 78.6 | 76.8 | 75.1 | 73.4 | 71.7 | 69.9 | 68.1 | 66.3 | null | null |
"Angola" | "AGO" | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | 138.3 | 137.5 | 136.8 | 136.0 | 135.3 | 134.9 | 134.4 | 134.1 | 133.8 | 133.6 | 133.5 | 133.5 | 133.5 | 133.4 | 133.2 | 132.8 | 132.3 | 131.5 | 130.6 | 129.5 | 128.3 | 126.9 | 125.5 | 124.1 | 122.8 | 121.2 | 119.4 | 117.1 | 114.7 | 112.2 | 109.6 | 106.8 | 104.1 | 101.4 | 98.8 | 96.0 | null | null |
"Albania" | "ALB" | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | 73.0 | 68.4 | 64.0 | 59.9 | 56.1 | 52.4 | 49.1 | 45.9 | 43.2 | 40.8 | 38.6 | 36.7 | 35.1 | 33.7 | 32.5 | 31.4 | 30.3 | 29.1 | 27.9 | 26.8 | 25.5 | 24.4 | 23.2 | 22.1 | 21.0 | 20.0 | 19.1 | 18.3 | 17.4 | 16.7 | 16.0 | 15.4 | 14.8 | 14.3 | 13.8 | 13.3 | 12.9 | 12.5 | null | null |
"Andorra" | "AND" | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | 7.5 | 7.0 | 6.5 | 6.1 | 5.6 | 5.2 | 5.0 | 4.6 | 4.3 | 4.1 | 3.9 | 3.7 | 3.5 | 3.3 | 3.2 | 3.1 | 2.9 | 2.8 | 2.7 | 2.6 | 2.5 | 2.4 | 2.3 | 2.2 | 2.1 | 2.1 | null | null |
The data includes regional aggregates. To tell apart the regional aggregates we need the metadata. Every row in the data table has a corresponding row in the metadata table. Where the row has regional aggregate data, the Region
column in the metadata table is NaN
.
def ordered_categorical(s, categories=None):
"""
Create a categorical ordered according to the categories
"""
= getattr(s, "name", "")
name if categories is None:
return pl.Series(name, s).cast(pl.Categorical)
with pl.StringCache():
pl.Series(categories).cast(pl.Categorical)return pl.Series(name, s).cast(pl.Categorical)
= {"Country Code": "code", "Region": "region", "IncomeGroup": "income_group"}
columns
= (
metadata
pl.scan_csv("data/API_SP.DYN.IMRT.IN_DS2_en_csv_v2/Metadata_Country_API_SP.DYN.IMRT.IN_DS2_en_csv_v2.csv"
)
.rename(columns)list(columns.values()))
.select(filter(
.# Drop the regional aggregate information
"region") != "") & (col("income_group") != "")
(col(
)
.collect()
)
= ["High income", "Upper middle income", "Lower middle income", "Low income"]
cat_order = metadata.with_columns(
metadata "income_group"], cat_order)
ordered_categorical(metadata[
)
10) metadata.head(
shape: (10, 3)
code | region | income_group |
---|---|---|
str | str | cat |
"ABW" | "Latin America & Caribbean" | "High income" |
"AFG" | "South Asia" | "Low income" |
"AGO" | "Sub-Saharan Africa" | "Lower middle income" |
"ALB" | "Europe & Central Asia" | "Upper middle income" |
"AND" | "Europe & Central Asia" | "High income" |
"ARE" | "Middle East & North Africa" | "High income" |
"ARG" | "Latin America & Caribbean" | "Upper middle income" |
"ARM" | "Europe & Central Asia" | "Lower middle income" |
"ASM" | "East Asia & Pacific" | "Upper middle income" |
"ATG" | "Latin America & Caribbean" | "High income" |
Remove the regional aggregates, to create a table with only country data
= data.join(metadata, on="code")
country_data country_data.head()
shape: (5, 62)
country | code | y1960 | y1961 | y1962 | y1963 | y1964 | y1965 | y1966 | y1967 | y1968 | y1969 | y1970 | y1971 | y1972 | y1973 | y1974 | y1975 | y1976 | y1977 | y1978 | y1979 | y1980 | y1981 | y1982 | y1983 | y1984 | y1985 | y1986 | y1987 | y1988 | y1989 | y1990 | y1991 | y1992 | y1993 | y1994 | y1995 | y1996 | y1997 | y1998 | y1999 | y2000 | y2001 | y2002 | y2003 | y2004 | y2005 | y2006 | y2007 | y2008 | y2009 | y2010 | y2011 | y2012 | y2013 | y2014 | y2015 | y2016 | region | income_group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
str | str | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | str | str | str | cat |
"Aruba" | "ABW" | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | "Latin America & Caribbean" | "High income" |
"Afghanistan" | "AFG" | null | 240.5 | 236.3 | 232.3 | 228.5 | 224.6 | 220.7 | 217.0 | 213.3 | 209.8 | 206.1 | 202.2 | 198.2 | 194.3 | 190.3 | 186.6 | 182.6 | 178.7 | 174.5 | 170.4 | 166.1 | 161.8 | 157.5 | 153.2 | 148.7 | 144.5 | 140.2 | 135.7 | 131.3 | 126.8 | 122.5 | 118.3 | 114.4 | 110.9 | 107.7 | 105.0 | 102.7 | 100.7 | 98.9 | 97.2 | 95.4 | 93.4 | 91.2 | 89.0 | 86.7 | 84.4 | 82.3 | 80.4 | 78.6 | 76.8 | 75.1 | 73.4 | 71.7 | 69.9 | 68.1 | 66.3 | null | null | "South Asia" | "Low income" |
"Angola" | "AGO" | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | 138.3 | 137.5 | 136.8 | 136.0 | 135.3 | 134.9 | 134.4 | 134.1 | 133.8 | 133.6 | 133.5 | 133.5 | 133.5 | 133.4 | 133.2 | 132.8 | 132.3 | 131.5 | 130.6 | 129.5 | 128.3 | 126.9 | 125.5 | 124.1 | 122.8 | 121.2 | 119.4 | 117.1 | 114.7 | 112.2 | 109.6 | 106.8 | 104.1 | 101.4 | 98.8 | 96.0 | null | null | "Sub-Saharan Africa" | "Lower middle income" |
"Albania" | "ALB" | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | 73.0 | 68.4 | 64.0 | 59.9 | 56.1 | 52.4 | 49.1 | 45.9 | 43.2 | 40.8 | 38.6 | 36.7 | 35.1 | 33.7 | 32.5 | 31.4 | 30.3 | 29.1 | 27.9 | 26.8 | 25.5 | 24.4 | 23.2 | 22.1 | 21.0 | 20.0 | 19.1 | 18.3 | 17.4 | 16.7 | 16.0 | 15.4 | 14.8 | 14.3 | 13.8 | 13.3 | 12.9 | 12.5 | null | null | "Europe & Central Asia" | "Upper middle income" |
"Andorra" | "AND" | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | 7.5 | 7.0 | 6.5 | 6.1 | 5.6 | 5.2 | 5.0 | 4.6 | 4.3 | 4.1 | 3.9 | 3.7 | 3.5 | 3.3 | 3.2 | 3.1 | 2.9 | 2.8 | 2.7 | 2.6 | 2.5 | 2.4 | 2.3 | 2.2 | 2.1 | 2.1 | null | null | "Europe & Central Asia" | "High income" |
We are interested in the changes in rank between 1960 and 2015. To plot a reasonable sized graph, we randomly sample 35 countries.
= (
sampled_data =["y1960", "y2015"])
country_data.drop_nulls(subset=35, seed=123)
.sample(n
.with_columns(=col("y1960").rank(method="ordinal").cast(pl.Int64),
y1960_rank=col("y2015").rank(method="ordinal").cast(pl.Int64),
y2015_rank
)"y2015_rank", descending=True)
.sort(
)
sampled_data.head()
shape: (5, 64)
country | code | y1960 | y1961 | y1962 | y1963 | y1964 | y1965 | y1966 | y1967 | y1968 | y1969 | y1970 | y1971 | y1972 | y1973 | y1974 | y1975 | y1976 | y1977 | y1978 | y1979 | y1980 | y1981 | y1982 | y1983 | y1984 | y1985 | y1986 | y1987 | y1988 | y1989 | y1990 | y1991 | y1992 | y1993 | y1994 | y1995 | y1996 | y1997 | y1998 | y1999 | y2000 | y2001 | y2002 | y2003 | y2004 | y2005 | y2006 | y2007 | y2008 | y2009 | y2010 | y2011 | y2012 | y2013 | y2014 | y2015 | y2016 | region | income_group | y1960_rank | y2015_rank | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
str | str | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | str | str | str | cat | i64 | i64 |
"Sierra Leone" | "SLE" | 223.6 | 220.5 | 217.5 | 214.2 | 211.0 | 207.6 | 204.2 | 200.8 | 197.3 | 194.1 | 191.0 | 188.0 | 185.2 | 182.6 | 180.0 | 177.5 | 175.3 | 173.2 | 171.2 | 169.2 | 167.3 | 165.6 | 164.1 | 162.8 | 161.5 | 160.4 | 159.4 | 158.3 | 157.6 | 157.0 | 156.5 | 156.1 | 155.7 | 155.2 | 154.5 | 153.4 | 152.0 | 150.1 | 148.1 | 145.8 | 143.3 | 140.5 | 137.7 | 134.6 | 131.4 | 128.1 | 124.5 | 120.5 | 116.2 | 111.7 | 107.0 | 102.3 | 97.9 | 93.8 | 90.2 | 87.1 | null | null | "Sub-Saharan Africa" | "Low income" | 35 | 35 |
"Cote d'Ivoire" | "CIV" | 208.4 | 203.0 | 197.7 | 192.8 | 188.0 | 183.3 | 178.7 | 174.2 | 169.9 | 165.4 | 161.0 | 156.4 | 151.3 | 146.1 | 140.7 | 135.1 | 129.7 | 124.7 | 120.2 | 116.6 | 113.7 | 111.4 | 109.5 | 108.0 | 106.9 | 106.1 | 105.5 | 105.2 | 104.9 | 104.9 | 104.9 | 104.8 | 104.7 | 104.7 | 104.6 | 104.4 | 104.0 | 103.3 | 102.3 | 101.0 | 99.5 | 97.7 | 95.7 | 93.6 | 91.4 | 88.9 | 86.7 | 84.1 | 81.3 | 79.0 | 76.9 | 75.0 | 72.8 | 70.6 | 68.5 | 66.6 | null | null | "Sub-Saharan Africa" | "Lower middle income" | 33 | 34 |
"Benin" | "BEN" | 186.9 | 183.9 | 180.6 | 177.1 | 173.6 | 170.2 | 166.8 | 164.0 | 161.5 | 159.2 | 157.1 | 154.9 | 152.5 | 149.8 | 146.8 | 143.5 | 140.1 | 136.7 | 133.6 | 130.9 | 128.7 | 126.6 | 124.7 | 122.8 | 120.9 | 118.9 | 116.9 | 114.8 | 112.6 | 110.4 | 108.0 | 105.6 | 103.2 | 100.9 | 98.9 | 97.2 | 95.6 | 94.2 | 92.7 | 91.1 | 89.3 | 87.4 | 85.2 | 83.0 | 80.8 | 78.8 | 76.9 | 75.2 | 73.7 | 72.3 | 71.0 | 69.8 | 68.5 | 67.2 | 65.7 | 64.2 | null | null | "Sub-Saharan Africa" | "Low income" | 32 | 33 |
"Burkina Faso" | "BFA" | 161.3 | 159.4 | 157.5 | 155.8 | 154.3 | 153.0 | 151.8 | 150.9 | 150.2 | 149.7 | 149.3 | 148.5 | 147.1 | 144.6 | 141.0 | 136.6 | 131.9 | 127.4 | 123.4 | 120.2 | 117.6 | 115.6 | 113.9 | 112.4 | 110.8 | 109.0 | 107.1 | 105.3 | 103.8 | 102.9 | 102.5 | 102.3 | 102.4 | 102.4 | 102.1 | 101.4 | 100.5 | 99.4 | 98.3 | 97.3 | 96.2 | 95.0 | 93.4 | 91.4 | 88.9 | 86.0 | 82.7 | 79.2 | 75.8 | 72.5 | 69.7 | 67.3 | 65.4 | 63.7 | 62.2 | 60.9 | null | null | "Sub-Saharan Africa" | "Low income" | 30 | 32 |
"Ghana" | "GHA" | 125.1 | 123.8 | 122.7 | 121.8 | 121.2 | 120.8 | 120.7 | 120.6 | 120.6 | 120.5 | 120.1 | 119.5 | 118.2 | 116.5 | 114.2 | 111.5 | 108.7 | 106.0 | 103.8 | 102.1 | 100.9 | 100.1 | 99.3 | 98.4 | 96.8 | 94.7 | 92.1 | 89.0 | 85.8 | 82.7 | 79.8 | 77.5 | 75.6 | 74.1 | 73.0 | 72.0 | 71.0 | 69.8 | 68.4 | 66.7 | 64.9 | 63.0 | 61.2 | 59.6 | 58.1 | 56.8 | 55.6 | 54.4 | 53.1 | 51.7 | 50.2 | 48.6 | 47.0 | 45.5 | 44.2 | 42.8 | null | null | "Sub-Saharan Africa" | "Lower middle income" | 25 | 31 |
First graph
(
ggplot(sampled_data)+ geom_text(aes(1, "y1960_rank", label="country"), ha="right", size=9)
+ geom_text(aes(2, "y2015_rank", label="country"), ha="left", size=9)
+ geom_point(aes(1, "y1960_rank", color="income_group"), size=2.5)
+ geom_point(aes(2, "y2015_rank", color="income_group"), size=2.5)
+ geom_segment(
=1, y="y1960_rank", xend=2, yend="y2015_rank", color="income_group")
aes(x
)+ scale_y_reverse()
)
It has the form we want, but we need to tweak it.
# Text colors
= "#252525"
black1 = "#222222"
black2
(
ggplot(sampled_data)# Slight modifications for the original lines,
# 1. Nudge the text to either sides of the points
# 2. Alter the color and alpha values
+ geom_text(
1, "y1960_rank", label="country"),
aes(=-0.05,
nudge_x="right",
ha=9,
size=black1,
color
)+ geom_text(
2, "y2015_rank", label="country"),
aes(=0.05,
nudge_x="left",
ha=9,
size=black1,
color
)+ geom_point(aes(1, "y1960_rank", color="income_group"), size=2.5, alpha=0.7)
+ geom_point(aes(2, "y2015_rank", color="income_group"), size=2.5, alpha=0.7)
+ geom_segment(
=1, y="y1960_rank", xend=2, yend="y2015_rank", color="income_group"),
aes(x=0.7,
alpha
)# Text Annotations
+ annotate(
"text",
=1,
x=0,
y="Rank in 1960",
label="bold",
fontweight="right",
ha=10,
size=black2,
color
)+ annotate(
"text",
=2,
x=0,
y="Rank in 2015",
label="bold",
fontweight="left",
ha=10,
size=black2,
color
)+ annotate(
"text", x=1.5, y=0, label="Lines show change in rank", size=9, color=black1
)+ annotate(
"label",
=1.5,
x=3,
y="Lower infant\ndeath rates",
label=9,
size=black1,
color=0,
label_size="italic",
fontstyle
)+ annotate(
"label",
=1.5,
x=33,
y="Higher infant\ndeath rates",
label=9,
size=black1,
color=0,
label_size="italic",
fontstyle
)# Prevent country names from being chopped off
+ lims(x=(0.35, 2.65))
+ labs(color="Income Group")
# Countries with lower rates on top
+ scale_y_reverse()
# Change colors
+ scale_color_brewer(type="qual", palette=2)
# Removes all decorations
+ theme_void()
# Changing the figure size prevents the country names from squishing up
+ theme(figure_size=(8, 11))
)