# NOTE: This notebook uses the polars package
import numpy as np
from plotnine import *
import polars as pl
from polars import col
An Elaborate Range Plot
segment
Comparing the point to point difference of many similar variables
Read the data.
Source: Pew Research Global Attitudes Spring 2015
!head -n 20 "data/survey-social-media.csv"
PSRAID,COUNTRY,Q145,Q146,Q70,Q74
100000,Ethiopia,Female,35,No,
100001,Ethiopia,Female,25,No,
100002,Ethiopia,Male,40,Don’t know,
100003,Ethiopia,Female,30,Don’t know,
100004,Ethiopia,Male,22,No,
100005,Ethiopia,Male,40,No,
100006,Ethiopia,Female,20,No,
100007,Ethiopia,Female,18,No,No
100008,Ethiopia,Male,50,No,
100009,Ethiopia,Male,35,No,
100010,Ethiopia,Female,20,No,
100011,Ethiopia,Female,30,Don’t know,
100012,Ethiopia,Male,60,No,
100013,Ethiopia,Male,18,No,
100014,Ethiopia,Male,40,No,
100015,Ethiopia,Male,28,Don’t know,
100016,Ethiopia,Female,55,Don’t know,
100017,Ethiopia,Male,30,Don’t know,
100018,Ethiopia,Female,22,No,
# Map the survey's question codes (see the CSV header) to readable names.
columns = dict(
    COUNTRY="country",
    Q145="gender",
    Q146="age",
    Q70="use_internet",
    Q74="use_social_media",
)

data = (
    pl.scan_csv(
        "data/survey-social-media.csv",
        # Keep the age column (Q146) as text; the age groups created later
        # are derived with string comparisons.
        # `schema_overrides` is the current name of the deprecated `dtypes`.
        schema_overrides=dict(Q146=pl.Utf8),
    )
    .rename(columns)
    # Only these three columns are used in the rest of the notebook.
    .select(["country", "age", "use_social_media"])
    .collect()
)

# A reproducible peek at ten random rows.
data.sample(10, seed=123)
/tmp/ipykernel_3524/3539732853.py:10: DeprecationWarning: the argument `dtypes` for `scan_csv` is deprecated. It was renamed to `schema_overrides` in version 0.20.31.
shape: (10, 3)
country | age | use_social_media |
---|---|---|
str | str | str |
"India" | "23" | " " |
"Pakistan" | "18" | " " |
"Peru" | "39" | "Yes" |
"Jordan" | "56" | " " |
"United Kingdom" | "35" | "Yes" |
"Chile" | "24" | "Yes" |
"Israel" | "32" | "No" |
"Pakistan" | "39" | "No" |
"Chile" | "26" | "Yes" |
"Nigeria" | "43" | "Yes" |
Create age groups for users of social media
# Answers that count as a response to the social media question.
yes_no = ["Yes", "No"]
# Age groups kept for plotting; rows mapped to "" below are dropped.
valid_age_groups = ["18-34", "35-49", "50+"]

rdata = (
    data.with_columns(
        # `age` is a string column, so these comparisons are lexicographic.
        # NOTE(review): this presumably relies on ages being two-digit
        # strings and on "98" and above (or non-numeric text) marking
        # non-answers -- confirm against the survey codebook.
        age_group=pl.when(col("age") <= "34")
        .then(pl.lit("18-34"))
        .when(col("age") <= "49")
        .then(pl.lit("35-49"))
        .when(col("age") < "98")
        .then(pl.lit("50+"))
        .otherwise(pl.lit("")),
        # Number of rows per country, computed before any filtering so it
        # can serve as the denominator of the response rate.
        country_count=pl.len().over("country"),
    )
    .filter(
        col("age_group").is_in(valid_age_groups) & col("use_social_media").is_in(yes_no)
    )
    .group_by(["country", "age_group"])
    .agg(
        # social media use percentage
        sm_use_percent=(col("use_social_media") == "Yes").sum() * 100 / pl.len(),
        # social media question response rate
        smq_response_rate=col("use_social_media").is_in(yes_no).sum()
        * 100
        / col("country_count").first(),
    )
    .sort(["country", "age_group"])
)

rdata.head()
/tmp/ipykernel_3524/3994701628.py:13: DeprecationWarning: `pl.count()` is deprecated. Please use `pl.len()` instead.
(Deprecated in version 0.20.5)
/tmp/ipykernel_3524/3994701628.py:21: DeprecationWarning: `pl.count()` is deprecated. Please use `pl.len()` instead.
(Deprecated in version 0.20.5)
shape: (5, 4)
country | age_group | sm_use_percent | smq_response_rate |
---|---|---|---|
str | str | f64 | f64 |
"Argentina" | "18-34" | 90.883191 | 35.1 |
"Argentina" | "35-49" | 84.40367 | 21.8 |
"Argentina" | "50+" | 67.333333 | 15.0 |
"Australia" | "18-34" | 90.862944 | 19.621514 |
"Australia" | "35-49" | 78.04878 | 20.418327 |
Top 14 countries by response rate to the social media question.
def col_format(name, fmt):
    """
    Return an expression that formats column `name` with a Python format string.

    Each element is passed to ``fmt.format(x=element)``, so `fmt` must refer
    to the value as ``x`` (for example ``"{x:.0f}"``). Python formatting is
    used for finer control over the resulting text.
    """
    return col(name).map_elements(
        lambda x: fmt.format(x=x),
        # str.format always produces text; declaring the dtype avoids
        # polars' MapWithoutReturnDtypeWarning.
        return_dtype=pl.Utf8,
    )
def float_to_str_round(name):
    """Return an expression rendering column `name` as text with no decimals."""
    no_decimals = "{x:.0f}"
    return col_format(name, no_decimals)
# Number of countries to show in the plot.
n = 14

# Rank countries by their summed response rate (over age groups) and keep
# the top n.
top = (
    rdata.group_by("country")
    .agg(r=col("smq_response_rate").sum())
    .sort("r", descending=True)
    .head(n)
)
top_countries = top["country"]

# Percentage labels: plain rounded numbers, and a variant with a "%" suffix.
expr = float_to_str_round("sm_use_percent")
expr_pct = expr + "%"

# `is_in` with a Series of the same dtype is ambiguous and deprecated in
# polars, so the countries are passed as a plain list.
point_data = rdata.filter(col("country").is_in(top_countries.to_list())).with_columns(
    col("country").cast(pl.Categorical),
    # Only the labels for France carry the "%" sign.
    sm_use_percent_str=pl.when(col("country") == "France")
    .then(expr_pct)
    .otherwise(expr),
)

point_data.head()
/tmp/ipykernel_3524/2682800770.py:24: DeprecationWarning: `is_in` with a collection of the same datatype is ambiguous and deprecated.
Please use `implode` to return to previous behavior.
See https://github.com/pola-rs/polars/issues/22149 for more information.
sys:1: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.
sys:1: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.
shape: (5, 5)
country | age_group | sm_use_percent | smq_response_rate | sm_use_percent_str |
---|---|---|---|---|
cat | str | f64 | f64 | str |
"Australia" | "18-34" | 90.862944 | 19.621514 | "91" |
"Australia" | "35-49" | 78.04878 | 20.418327 | "78" |
"Australia" | "50+" | 48.479087 | 52.390438 | "48" |
"Canada" | "18-34" | 92.063492 | 25.099602 | "92" |
"Canada" | "35-49" | 75.925926 | 21.513944 | "76" |
# One row per country: the lowest and highest age-group percentage, the gap
# between them, and string versions of all three for use as labels.
segment_data = (
    point_data.group_by("country")
    .agg(
        min=col("sm_use_percent").min(),
        max=col("sm_use_percent").max(),
    )
    .with_columns(gap=(col("max") - col("min")))
    # Ascending by gap.
    .sort(
        "gap",
    )
    .with_columns(
        min_str=float_to_str_round("min"),
        max_str=float_to_str_round("max"),
        gap_str=float_to_str_round("gap"),
    )
)

segment_data.head()
sys:1: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.
sys:1: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.
sys:1: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.
shape: (5, 7)
country | min | max | gap | min_str | max_str | gap_str |
---|---|---|---|---|---|---|
cat | f64 | f64 | f64 | str | str | str |
"Russia" | 76.07362 | 95.151515 | 19.077896 | "76" | "95" | "19" |
"Israel" | 55.405405 | 88.311688 | 32.906283 | "55" | "88" | "33" |
"United Kingdom" | 52.74463 | 86.096257 | 33.351627 | "53" | "86" | "33" |
"United States" | 52.597403 | 88.669951 | 36.072548 | "53" | "89" | "36" |
"Canada" | 53.986333 | 92.063492 | 38.077159 | "54" | "92" | "38" |
Format the floating point data that will be plotted into strings
First plot
# The right column (youngest-oldest gap) location
xgap = 112

(
    ggplot()
    # Range strip
    + geom_segment(
        segment_data,
        aes(x="min", xend="max", y="country", yend="country"),
        size=6,
        color="#a7a9ac",
    )
    # Age group markers
    + geom_point(
        point_data,
        aes("sm_use_percent", "country", color="age_group", fill="age_group"),
        size=5,
        stroke=0.7,
    )
    # Age group percentages
    # 50+: label placed to the left of its marker
    + geom_text(
        point_data.filter(col("age_group") == "50+"),
        aes(
            x="sm_use_percent-2",
            y="country",
            label="sm_use_percent_str",
            color="age_group",
        ),
        size=8,
        ha="right",
    )
    # 35-49: white label starting just right of its marker
    + geom_text(
        point_data.filter(col("age_group") == "35-49"),
        aes(x="sm_use_percent+2", y="country", label="sm_use_percent_str"),
        size=8,
        ha="left",
        va="center",
        color="white",
    )
    # 18-34: label placed to the right of its marker
    + geom_text(
        point_data.filter(col("age_group") == "18-34"),
        aes(
            x="sm_use_percent+2",
            y="country",
            label="sm_use_percent_str",
            color="age_group",
        ),
        size=8,
        ha="left",
    )
    # gap difference
    + geom_text(
        segment_data,
        aes(x=xgap, y="country", label="gap_str"),
        size=9,
        fontweight="bold",
        format_string="+{}",
    )
)
Tweak it
# The right column (youngest-oldest gap) location
xgap = 115

(
    ggplot()
    # Background Strips  # new
    + geom_segment(
        segment_data,
        aes(y="country", yend="country"),
        x=0,
        xend=101,
        size=8.5,
        color="#edece3",
    )
    # vertical grid lines along the strips  # new
    # x cycles through 10..90 once per country while y repeats each country
    # position nine times, so every country gets all nine grid lines for any
    # n (tiling both sequences only covers every pair when n and 9 share no
    # common factor).
    + annotate(
        "segment",
        x=list(range(10, 100, 10)) * n,
        xend=list(range(10, 100, 10)) * n,
        y=np.repeat(np.arange(1, n + 1), 9) - 0.25,
        yend=np.repeat(np.arange(1, n + 1), 9) + 0.25,
        color="#CCCCCC",
    )
    # Range strip
    + geom_segment(
        segment_data,
        aes(x="min", xend="max", y="country", yend="country"),
        size=6,
        color="#a7a9ac",
    )
    # Age group markers
    + geom_point(
        point_data,
        aes("sm_use_percent", "country", color="age_group", fill="age_group"),
        size=5,
        stroke=0.7,
    )
    # Age group percentages
    + geom_text(
        point_data.filter(col("age_group") == "50+"),
        aes(
            x="sm_use_percent-2",
            y="country",
            label="sm_use_percent_str",
            color="age_group",
        ),
        size=8,
        ha="right",
    )
    + geom_text(
        point_data.filter(col("age_group") == "35-49"),
        aes(x="sm_use_percent+2", y="country", label="sm_use_percent_str"),
        size=8,
        ha="left",
        va="center",
        color="white",
    )
    + geom_text(
        point_data.filter(col("age_group") == "18-34"),
        aes(
            x="sm_use_percent+2",
            y="country",
            label="sm_use_percent_str",
            color="age_group",
        ),
        size=8,
        ha="left",
    )
    # country names as right-aligned text (instead of a y-axis)  # new
    + geom_text(
        segment_data,
        aes(y="country", label="country"),
        x=-1,
        size=8,
        ha="right",
        fontweight="bold",
        color="#222222",
    )
    # gap difference
    + geom_vline(xintercept=xgap, color="#edece3", size=32)  # new
    + geom_text(
        segment_data,
        aes(x=xgap, y="country", label="gap_str"),
        size=9,
        fontweight="bold",
        format_string="+{}",
    )
    # Annotations  # new
    + annotate("text", x=31, y=n + 1.1, label="50+", size=9, color="#ea9f2f", va="top")
    + annotate(
        "text", x=56, y=n + 1.1, label="35-49", size=9, color="#6d6e71", va="top"
    )
    + annotate(
        "text", x=85, y=n + 1.1, label="18-34", size=9, color="#939c49", va="top"
    )
    + annotate(
        "text",
        x=xgap,
        y=n + 0.5,
        label="Youngest-\nOldest Gap",
        size=9,
        color="#444444",
        va="bottom",
        ha="center",
    )
    + annotate("point", x=[31, 56, 85], y=n + 0.3, alpha=0.85, stroke=0)
    + annotate(
        "segment",
        x=[31, 56, 85],
        xend=[31, 56, 85],
        y=n + 0.3,
        yend=n + 0.8,
        alpha=0.85,
    )
    # dotted separators at y = 2.5, 4.5, ... (below n)
    + annotate(
        "hline",
        yintercept=[x + 0.5 for x in range(2, n, 2)],
        alpha=0.5,
        linetype="dotted",
        size=0.7,
    )
    # Better spacing and color  # new
    + scale_x_continuous(limits=(-18, xgap + 2))
    + scale_y_discrete(expand=(0, 0.25, 0.1, 0))
    + scale_fill_manual(values=["#c3ca8c", "#d1d3d4", "#f2c480"])
    + scale_color_manual(values=["#939c49", "#6d6e71", "#ea9f2f"])
    + guides(color=None, fill=None)
    + theme_void()
    + theme(figure_size=(8, 8.5))
)
Instead of looking at this plot as having a country variable on the y-axis and a percentage variable on the x-axis, we can view it as many independent variables stacked vertically, the values of which have a similar scale.