(vis-common-plots-two)=
# Common Plots II

## Introduction

Carrying on from the previous chapter, we'll look at more of the most common plots that you might want to make—and how to create them using the most popular data visualisation libraries, including [**matplotlib**](https://matplotlib.org/), [**lets-plot**](https://lets-plot.org/), [**seaborn**](https://seaborn.pydata.org/), [**altair**](https://altair-viz.github.io/), and [**plotly**](https://plotly.com/python/).

Let's import the libraries we'll need.

In [None]:
import warnings
from pathlib import Path

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import seaborn.objects as so
from lets_plot import *
from lets_plot.mapping import as_discrete
from vega_datasets import data

# Set the seed for random numbers so that results are reproducible
seed_for_prng = 78557
prng = np.random.default_rng(
    seed_for_prng
)  # prng = pseudo-random number generator

# Turn off all warnings
warnings.filterwarnings("ignore")
# Set up lets-plot charts
LetsPlot.setup_html()

In [None]:
import matplotlib_inline.backend_inline

# Plot settings: apply the book's matplotlib style sheet
plt.style.use(
    "https://github.com/aeturrell/coding-for-economists/raw/main/plot_style.txt"
)
# Use SVG as the format for inline matplotlib figures
matplotlib_inline.backend_inline.set_matplotlib_formats("svg")
# some faffing here to try and get seaborn not to change theme in object API
# sns.set_theme(rc=plt.rcParams)
# Set max rows displayed for readability
pd.set_option("display.max_rows", 6)

## Overlapping Area plot

For this, let's look at the dominance of the three most used methods for detecting exoplanets.

In [None]:
planets = sns.load_dataset("planets")
# Total the "number" column for each detection method, largest first,
# and keep the names of the top three methods
method_totals = planets.groupby(["method"])["number"].sum()
most_pop_methods = method_totals.sort_values(ascending=False).index[:3].values
# Restrict the data to rows that use one of those three methods
planets = planets[planets["method"].isin(most_pop_methods)]
planets.head()

### Matplotlib

The easiest way to do this in matplotlib is to adjust the data a bit first and then use the built-in **pandas** plot function. (This is true in other cases too, but in this case it's much more complex otherwise).

In [None]:
# Sum the "number" column by year and method, then move the methods into
# columns so that pandas draws one area per method
yearly_by_method = planets.groupby(["year", "method"])["number"].sum().unstack()
ax = yearly_by_method.plot.area(alpha=0.6, ylim=(0, None))
# Trailing semicolon stops the notebook echoing the returned Text object
ax.set_title("Planets discovered by top 3 methods", loc="left");

### Seaborn

In [None]:
# One row per (year, method) holding the summed "number" column
planets_by_year = planets.groupby(["year", "method"])["number"].sum().reset_index()
(
    so.Plot(planets_by_year, x="year", y="number", color="method")
    # Stack the aggregated areas of the methods on top of each other
    .add(so.Area(alpha=0.3), so.Agg(), so.Stack())
)

### Lets-Plot



In [None]:
# One row per (year, method) holding the summed "number" column
area_data = planets.groupby(["year", "method"])["number"].sum().reset_index()
area_aes = aes(x="year", y="number", fill="method", group="method", color="method")
(
    ggplot(area_data, area_aes)
    + geom_area(stat="identity", alpha=0.5)
    # Format the year axis labels with the "d" (integer) format
    + scale_x_continuous(format="d")
)

### Altair


In [None]:
year_totals = planets.groupby(["year", "method"])["number"].sum().reset_index()
# Turn the year into a date at the end of that year so that it can be
# encoded as a temporal ("T") field below
year_totals["year"] = (
    pd.to_datetime(year_totals["year"], format="%Y") + pd.tseries.offsets.YearEnd()
)
alt.Chart(year_totals).mark_area().encode(x="year:T", y="number:Q", color="method:N")

## Slope chart

A slope chart has two points connected by a line and is good for indicating how relationships between variables have changed over time.

In [None]:
gdp_url = "https://raw.githubusercontent.com/selva86/datasets/master/gdppercap.csv"
wide_gdp = pd.read_csv(gdp_url)
# Reshape from one column per year to long format: every column after the
# first becomes a (Year, GDP per capita) pair for its continent
df = wide_gdp.melt(
    id_vars=["continent"],
    value_vars=wide_gdp.columns[1:],
    value_name="GDP per capita",
    var_name="Year",
).rename(columns={"continent": "Continent"})
df.head()

### Matplotlib

There isn't an off-the-shelf way to do this in matplotlib but the example below shows that, with matplotlib, where there's a will there's a way! It's where the 'build-what-you-want' comes into its own. Note that the function that's defined returns an `Axes` object so that you can do further processing and tweaking as you like.

In [None]:
from matplotlib import lines as mlines


def slope_plot(data, x, y, group, before_txt="Before", after_txt="After", ax=None):
    """Draw a slope chart comparing ``y`` between two periods for each group.

    Each group gets one line running from its value in the first period
    (drawn at x=1) to its value in the second period (drawn at x=3). Lines
    that fall are red, all others are green.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format data with one row per (group, period).
    x : str
        Column holding the period; it must have exactly two unique values.
    y : str
        Column holding the value to plot.
    group : str
        Column identifying the groups; used for the line labels.
    before_txt, after_txt : str
        Headings written above the left and right ends of the lines.
    ax : matplotlib Axes, optional
        Axes to draw on. A new figure and axes are created when omitted.

    Returns
    -------
    matplotlib Axes
        The axes holding the chart, so that it can be tweaked further.

    Raises
    ------
    ValueError
        If ``data[x]`` does not contain exactly two unique values.
    """
    if len(data[x].unique()) != 2:
        raise ValueError("Slope plot must have two unique periods.")
    wide_data = data[[x, y, group]].pivot(index=group, columns=x, values=y)
    x_names = list(wide_data.columns)
    starts = wide_data[x_names[0]]
    ends = wide_data[x_names[1]]

    if ax is None:
        _, ax = plt.subplots()

    # Horizontal positions of the two periods and the gap before the labels
    left_x, right_x, label_gap = 1, 3, 0.05
    y_min = data[y].min()
    y_max = data[y].max()
    # The headings sit at the top of the y range, 10% above the largest value
    header_y = y_max + abs(y_max) * 0.1

    # Vertical guide lines, one per period
    ax.vlines(
        x=[left_x, right_x],
        ymin=y_min,
        ymax=y_max,
        color="black",
        alpha=0.7,
        linewidth=1,
        linestyles="dotted",
    )
    # Points
    for x_pos, values in ((left_x, starts), (right_x, ends)):
        ax.scatter(
            y=values,
            x=np.repeat(x_pos, wide_data.shape[0]),
            s=15,
            color="black",
            alpha=0.7,
        )
    # Line segments and annotation; drawing on `ax` directly (rather than on
    # whichever axes pyplot considers current) keeps everything on one axes
    for start, end, name in zip(starts, ends, wide_data.index):
        ax.plot(
            [left_x, right_x],
            [start, end],
            color="red" if start - end > 0 else "green",
            marker="o",
            markersize=6,
        )
        ax.text(
            left_x - label_gap,
            start,
            name,
            horizontalalignment="right",
            verticalalignment="center",
            fontdict={"size": 14},
        )
        ax.text(
            right_x + label_gap,
            end,
            name,
            horizontalalignment="left",
            verticalalignment="center",
            fontdict={"size": 14},
        )
    # 'Before' and 'After' annotations
    ax.text(
        left_x - label_gap,
        header_y,
        before_txt,
        horizontalalignment="right",
        verticalalignment="center",
        fontdict={"size": 16, "weight": 700},
    )
    ax.text(
        right_x + label_gap,
        header_y,
        after_txt,
        horizontalalignment="left",
        verticalalignment="center",
        fontdict={"size": 16, "weight": 700},
    )
    # Decoration
    ax.set(xlim=(0, 4), ylabel=y, ylim=(y_min - 0.1 * abs(y_min), header_y))
    ax.set_xticks([left_x, right_x])
    ax.set_xticklabels(x_names)
    # Hide the borders
    for ax_pos in ["top", "bottom", "right", "left"]:
        ax.spines[ax_pos].set_visible(False)
    return ax


# Trailing semicolon stops the notebook echoing the returned Axes object
slope_plot(df, x="Year", y="GDP per capita", group="Continent");

### Seaborn

In [None]:
slope_base = so.Plot(df, x="Year", y="GDP per capita", color="Continent")
# First layer: aggregated line with markers; second layer: a range mark
slope_base.add(so.Line(marker="o"), so.Agg()).add(so.Range())

### Lets-Plot

In [None]:
# Both layers colour by continent, so build that mapping once
continent_colour = aes(color="Continent")
(
    ggplot(df, aes(x="Year", y="GDP per capita", group="Continent"))
    + geom_line(mapping=continent_colour, size=1)
    + geom_point(mapping=continent_colour, size=4)
)

### Altair

In [None]:
# Year is encoded as an ordinal ("O") field
slope_encoding = dict(x="Year:O", y="GDP per capita", color="Continent")
alt.Chart(df).mark_line().encode(**slope_encoding)

### Plotly

In [None]:
import plotly.graph_objects as go

year_labels = df["Year"].unique()
# The year labels are converted to integers for use as x coordinates
yr_names = [int(label) for label in year_labels]
# One row per continent; the year columns are renamed to their position
# (0, 1, ...) so that each row can be indexed as row[0], row[1]
px_df = (
    df.pivot(index="Continent", columns="Year", values="GDP per capita")
    .reset_index()
    .rename(columns={label: pos for pos, label in enumerate(year_labels)})
)

x_offset = 5
first_year, last_year = yr_names[0], yr_names[1]
palette = px.colors.qualitative.Plotly

fig1 = go.Figure()
# One line per continent, plus a text label at its first-year end
for index, row in px_df.iterrows():
    continent = row["Continent"]
    fig1.add_shape(
        type="line",
        x0=first_year,
        y0=row[0],
        x1=last_year,
        y1=row[1],
        name=continent,
        line=dict(color=palette[index]),
    )
    label_trace = go.Scatter(
        x=[first_year],
        y=[row[0]],
        text=continent,
        mode="text",
        name=None,
    )
    fig1.add_trace(label_trace)

# Pad the x range by x_offset on each side and scale the y range to 80%
# of the smallest value and 120% of the largest
value_cols = px_df[[0, 1]]
fig1.update_xaxes(range=[first_year - x_offset, last_year + x_offset])
fig1.update_yaxes(range=[value_cols.min().min() * 0.8, value_cols.max().max() * 1.2])
fig1.update_layout(showlegend=False)
fig1.show()

## Dumbbell Plot

These are excellent for showing a change in time with a large number of categories, as we will do here with continents and mean GDP per capita.

In [None]:
gdp_csv = "https://raw.githubusercontent.com/selva86/datasets/master/gdppercap.csv"
df = (
    pd.read_csv(gdp_csv)
    # Long format: every column after the first becomes a
    # (Year, GDP per capita) pair for its continent
    .pipe(
        lambda wide: wide.melt(
            id_vars=["continent"],
            value_vars=wide.columns[1:],
            value_name="GDP per capita",
            var_name="Year",
        )
    )
    .rename(columns={"continent": "Continent"})
)
df.head()

### Matplotlib

Again, no off-the-shelf method--but that's no problem when you can build it yourself.

In [None]:
def dumbbell_plot(data, x, y, change):
    """Draw and show a dumbbell plot of ``change`` across two periods.

    Each category in ``y`` gets one horizontal row holding a point for each
    of the two periods in ``x``, joined by a line.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format data with one row per (category, period).
    x : str
        Column holding the period; it must have exactly two unique values.
    y : str
        Column holding the categories; its first value must be a string.
    change : str
        Column holding the numeric value plotted on the horizontal axis.

    Raises
    ------
    ValueError
        If ``data[x]`` does not have exactly two unique values, or if the
        first value of ``data[y]`` is not a string.
    """
    if len(data[x].unique()) != 2:
        raise ValueError("Dumbbell plot must have two unique periods.")
    if not isinstance(data[y].iloc[0], str):
        raise ValueError("Dumbbell plot y variable only works with category values.")
    wide_data = data[[x, y, change]].pivot(index=y, columns=x, values=change)
    x_names = list(wide_data.columns)
    y_names = list(wide_data.index)
    positions = range(len(y_names))
    earlier = wide_data[x_names[0]]
    later = wide_data[x_names[1]]

    fig, ax = plt.subplots()
    # Points: the second period is added first, so it is listed first in the
    # legend, and has the higher zorder, so it is drawn on top
    ax.scatter(
        y=positions,
        x=later,
        s=50,
        color="#0e668b",
        alpha=0.9,
        zorder=2,
        label=x_names[1],
    )
    ax.scatter(
        y=positions,
        x=earlier,
        s=50,
        color="#a3c4dc",
        alpha=0.9,
        zorder=1,
        label=x_names[0],
    )
    # Line segments, drawn beneath the points (zorder=0) directly on `ax`
    # rather than on whichever axes pyplot considers current
    for pos, start, end in zip(positions, earlier, later):
        ax.plot([start, end], [pos, pos], color="skyblue", zorder=0)
    ax.set_yticks(positions)
    ax.set_yticklabels(y_names)
    # Decoration: hide every border except the bottom one
    for ax_pos in ["top", "right", "left"]:
        ax.spines[ax_pos].set_visible(False)
    ax.set_xlabel(change)
    ax.legend(frameon=False, loc="lower right")
    plt.show()


# One row per continent, with a point for each of the two years in the data
dumbbell_plot(df, x="Year", y="Continent", change="GDP per capita")

### Seaborn

In [None]:
# Fully opaque dots, coloured by year, on one row per continent
dot_mark = so.Dots(pointsize=10, fillalpha=1)
so.Plot(df, y="Continent", x="GDP per capita", color="Year").add(dot_mark)

### Lets-Plot

In [None]:
# Grouping by continent makes geom_line join the points of each continent
dumbbell_aes = aes(y="Continent", x="GDP per capita", group="Continent")
(
    ggplot(df, dumbbell_aes)
    + geom_line(color="black", size=2)
    + geom_point(aes(color="Year"), size=5)
    + ggsize(400, 500)
)

### Plotly

In [None]:
import plotly.graph_objects as go

fig1 = go.Figure()

yr_names = df["Year"].unique()
palette = px.colors.qualitative.Plotly

# Connecting line for each continent, from its first-year value to its
# second-year value
for cont in df["Continent"].unique():
    cdf = df[df["Continent"] == cont]
    start_value = cdf.loc[cdf["Year"] == yr_names[0], "GDP per capita"].values[0]
    end_value = cdf.loc[cdf["Year"] == yr_names[1], "GDP per capita"].values[0]
    fig1.add_shape(
        type="line",
        x0=start_value,
        y0=cont,
        x1=end_value,
        y1=cont,
        line=dict(color=palette[0], width=2),
    )
# One marker trace per year, each in its own palette colour
for i, year in enumerate(yr_names):
    yrdf = df[df["Year"] == year]
    year_markers = go.Scatter(
        y=yrdf["Continent"],
        x=yrdf["GDP per capita"],
        mode="markers",
        name=year,
        marker_color=palette[i],
        marker_size=10,
    )
    fig1.add_trace(year_markers)

fig1.show()

## Polar

I'm not sure I've ever seen a polar plot in economics, but you never know.

Let's generate some polar data first:


In [None]:
# Radius runs from 0 up to (but excluding) 2 in steps of 0.01; the angle
# grows in proportion to it, by 2*pi radians per unit of radius
r = np.arange(0, 2, 0.01)
theta = r * (2 * np.pi)
polar_data = pd.DataFrame(dict(r=r, theta=theta))
polar_data.head()

### Matplotlib


In [None]:
ax = plt.subplot(111, projection="polar")
ax.plot(polar_data["theta"], polar_data["r"])
ax.set(
    rmax=2,
    rticks=[0.5, 1, 1.5, 2],  # Fewer radial ticks
    rlabel_position=-22.5,  # Move radial labels away from plotted line
)
ax.grid(True)
plt.show()

### Plotly

In [None]:
# The angles are stored in radians; convert them to degrees for plotly
theta_degrees = polar_data["theta"].values * 180 / (np.pi)
spiral = go.Scatterpolar(
    r=polar_data["r"].values,
    theta=theta_degrees,
    mode="lines",
)
fig = go.Figure(data=spiral)

fig.update_layout(showlegend=False)
fig.show()

## Radar (or spider) chart

Let's generate some synthetic data for this one. Assumes that result to be shown is the sum of observations.

In [None]:
# Five columns (var1..var5) of four random integers in [0, 30). Drawing from
# the seeded generator `prng`, rather than numpy's global random state,
# makes this synthetic data the same on every run.
df = pd.DataFrame({f"var{i}": prng.integers(30, size=4) for i in range(1, 6)})
df.head()

In [None]:
from math import pi


def radar_plot(data, variables):
    """Draw a radar (spider) chart of the first row of ``data``.

    Only the first row of ``data`` is plotted. Each entry of ``variables``
    gets its own axis, and the axes are spaced evenly around the circle.

    Parameters
    ----------
    data : pandas.DataFrame
        Data whose first row supplies the values.
    variables : sequence of str
        Columns of ``data`` to plot, in order around the circle.

    Returns
    -------
    matplotlib Axes
        The polar axes holding the chart.
    """
    n_vars = len(variables)
    first_row = data.loc[data.index[0], variables].values.flatten().tolist()
    # Angle of each variable's axis: equal shares of the full circle
    spoke_angles = [k / float(n_vars) * 2 * pi for k in range(n_vars)]
    # Repeat the first point at the end so that the outline closes on itself
    closed_values = first_row + first_row[:1]
    closed_angles = spoke_angles + spoke_angles[:1]

    ax = plt.subplot(111, polar=True)
    # One labelled axis per variable
    ax.set_xticks(spoke_angles)
    ax.set_xticklabels(variables)
    # Put the radial labels at angle 0
    ax.set_rlabel_position(0)
    # Outline, then a translucent fill of the enclosed area
    ax.plot(closed_angles, closed_values, linewidth=1, linestyle="solid")
    ax.fill(closed_angles, closed_values, "b", alpha=0.1)
    return ax


# Plot every column; trailing semicolon stops the notebook echoing the Axes
radar_plot(df, df.columns);

### Plotly

In [None]:
df = px.data.wind()
print(df.head())
# One closed line per wind strength, drawn on plotly's dark template
polar_options = dict(
    r="frequency",
    theta="direction",
    color="strength",
    line_close=True,
    color_discrete_sequence=px.colors.sequential.Plasma_r,
    template="plotly_dark",
)
fig = px.line_polar(df, **polar_options)
fig.show()

## Wordcloud

These should be used sparingly. Let's grab part of a famous text from Project Gutenberg:

In [None]:
# To run this example, download smith_won.txt from
# https://github.com/aeturrell/coding-for-economists/blob/main/data/smith_won.txt
# and put it in a sub-folder called 'data'

# read_text opens the file, reads it and closes it again, so no file handle
# is left open
book_text = Path("data", "smith_won.txt").read_text(encoding="utf-8")
# Print some lines
print("\n".join(book_text.split("\n")[107:117]))

In [None]:
from wordcloud import WordCloud

cloud_maker = WordCloud(width=700, height=400)
wordcloud = cloud_maker.generate(book_text)
# Black figure background; the cloud is displayed as an image
fig, ax = plt.subplots(facecolor="k")
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
fig.tight_layout();

We can also create a 'mask' for the wordcloud to shape it how we like, here in the shape of a book.

In [None]:
# To run this example, download book_mask.png from
# https://github.com/aeturrell/coding-for-economists/raw/main/data/book_mask.png
# and put it in a sub-folder called 'data'
from PIL import Image

# Copy the pixels into an array inside a `with` block so that the image file
# is closed as soon as it has been read
with Image.open(Path("data", "book_mask.png")) as mask_image:
    mask = np.array(mask_image)
wc = WordCloud(width=700, height=400, mask=mask, background_color="white")
wordcloud = wc.generate(book_text)
fig, ax = plt.subplots(facecolor="white")
ax.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.tight_layout();

## Network diagrams

### networkx

The most well-established network visualisation package is [**networkx**](https://networkx.org/documentation/stable/index.html), which does a *lot* more than just visualisation. It has many different positioning options for rendering any given network, for instance in circular, spectral, spring, Fruchterman-Reingold, or other styles. In the below example, we use a **pandas** dataframe to specify the edges in two columns but there are various other ways to specify the network too, including ones that do not rely on **pandas**.

The underlying plot is rendered with **matplotlib**, meaning that you can customise it further should you need to. You can pass an `Axes` object `ax` to `nx.draw()` using `nx.draw(..., ax=ax)`.

In [None]:
import networkx as nx

# Each row is one edge, from the node in "source" to the node in "target"
df = pd.DataFrame(
    {
        "source": ["A", "B", "C", "A", "E", "F", "E", "G", "G", "D", "F"],
        "target": ["D", "A", "E", "C", "A", "F", "G", "D", "B", "G", "C"],
    }
)
G = nx.from_pandas_edgelist(df)
# nx.draw places nodes with a randomly initialised spring layout by default,
# so compute that layout explicitly with the notebook's seed to get the same
# picture on every run
node_positions = nx.spring_layout(G, seed=seed_for_prng)
nx.draw(G, pos=node_positions, with_labels=True, node_size=500, node_color="skyblue")

## Ridge, or 'joy', plots

These are famous from the front cover of “Unknown Pleasures” by Joy Division. Let's look at an example showing the global increase in temperature.

We'll use a summary of the daily land-surface average temperature anomaly produced by the Berkeley Earth averaging method. Temperatures are in Celsius and reported as anomalies relative to the Jan 1951-Dec 1980 average (the estimated Jan 1951-Dec 1980 land-average temperature is 8.63 +/- 0.06 C).

In [None]:
# To run this example, download the pickle file from
# https://github.com/aeturrell/coding-for-economists/blob/main/data/berkeley_data.pkl
# and put it in a sub-folder called 'data'
# NOTE: loading a pickle can execute arbitrary code, so only do this with
# files from a source you trust
berkeley_path = Path("data") / "berkeley_data.pkl"
df = pd.read_pickle(berkeley_path)
df.head()

### Lets-Plot

In [None]:
first_year, final_year = df["Year"].min(), df["Year"].max()

# Label the y axis only at years that are multiples of ten
breaks = [year for year in df["Year"].unique() if year % 10 == 0]
ridge_title = (
    f"Global daily temperature anomaly {first_year}-{final_year} "
    "\n(°C above 1951-80 average)"
)
(
    ggplot(df, aes("Anomaly", "Year", fill="Year"))
    + geom_area_ridges(scale=20, alpha=1, size=0.2, trim=True, show_legend=False)
    # The y scale is reversed
    + scale_y_continuous(breaks=breaks, trans="reverse")
    + scale_fill_viridis(option="inferno")
    + ggtitle(ridge_title)
)

## Contour Plot

Contour plots can help you show how a third variable, Z, varies with both X and Y (ie Z is a surface). The way that Z is depicted could be via the density of lines drawn in the X-Y plane (use `ax.contour()` for this) or via colour, as in the example below (using `ax.contourf()`).

The heatmap (or contour plot) below, which has a colour bar legend and a title that's rendered with latex, uses a perceptually uniform distribution that makes equal changes look equal; **matplotlib** has a few of these. If you need more colours, check out the packages [**colorcet**](https://colorcet.holoviz.org/) and [**palettable**](https://jiffyclub.github.io/palettable/).

### Matplotlib

Note that, in the below, Z is returned by a function that accepts a grid of X and Y values.

In [None]:
def f(x, y):
    """Return sin(x)**10 + cos(x) * cos(10 + x*y), computed with numpy."""
    power_term = np.power(np.sin(x), 10)
    cosine_term = np.cos(x) * np.cos(10 + x * y)
    return power_term + cosine_term


# 100 evenly spaced points from 0 to 5 along each axis
grid_points = 100
x, y = (np.linspace(0, 5, grid_points) for _ in range(2))

# Evaluate f at every combination of the x and y points
X, Y = np.meshgrid(x, y)
Z = f(X, Y)

fig, ax = plt.subplots()
cf = ax.contourf(X, Y, Z, cmap="plasma")
ax.set_title(r"$f(x,y) = \sin^{10}(x) + \cos(x)\cos\left(10 + y\cdot x\right)$")
# Colour bar legend for the filled contours
cbar = fig.colorbar(cf);

### Lets-Plot

In [None]:
# Flatten each 2-D grid into a 1-D column
contour_data = {
    name: grid.flatten() for name, grid in zip(("x", "y", "z"), (X, Y, Z))
}
contour_aes = aes(x="x", y="y", z="z", fill="..level..")
(
    ggplot(contour_data)
    + geom_contourf(contour_aes)
    + scale_fill_viridis(option="plasma")
    + ggtitle("Maths equations don't currently work")
)

### Plotly

In [None]:
import plotly.graph_objects as go

# z is the 2-D grid of values; x and y are the 1-D coordinates of the grid
contour_trace = go.Contour(z=Z, x=x, y=y)
grid_fig = go.Figure(data=contour_trace)

grid_fig.show()

## Waterfall chart

Waterfall charts are good for showing how different contributions combine to net out at a certain value. There's a package dedicated to them called [**waterfallcharts**](https://github.com/chrispaulca/waterfall). It builds on **matplotlib**. First, let's create some data:

In [None]:
# (label, amount) pairs, split into the parallel lists `a` and `b`
contributions = [
    ("sales", 10),
    ("returns", -30),
    ("credit fees", -7.5),
    ("rebates", -25),
    ("late charges", 95),
    ("shipping", -7),
]
a = [label for label, _ in contributions]
b = [amount for _, amount in contributions]

Now let's plot this data. Because the defaults of **waterfallcharts** don't play that nicely with the plot style used for this book, we'll temporarily switch back to the **matplotlib** default plot style using a *context* and `with` statement:

In [None]:
import waterfall_chart

# The default matplotlib style applies only inside this block
with plt.style.context("default"):
    plot = waterfall_chart.plot(a, b, sorted_value=True, rotation_value=0)

### Plotly



In [None]:
import plotly.graph_objects as go

# Append the net total as a final bar after the individual contributions
px_b = b + [sum(b)]
bar_labels = list(map(str, b)) + ["net"]
# Every contribution is a "relative" step; the last bar is the "total"
bar_kinds = ["relative"] * len(a) + ["total"]

waterfall_trace = go.Waterfall(
    name="20",
    orientation="v",
    measure=bar_kinds,
    x=a + ["net"],
    textposition="outside",
    text=bar_labels,
    y=px_b,
    connector={"line": {"color": "rgb(63, 63, 63)"}},
)
fig = go.Figure(waterfall_trace)

fig.show()

## Venn

Venn diagrams show the overlap between groups. As with some of these other, more unusual chart types, there's a special package that produces these and which builds on **matplotlib**.

In [None]:
from matplotlib_venn import venn2

# NOTE(review): subsets is presumably (A only, B only, A and B) - confirm
# against the matplotlib-venn documentation
venn2(subsets=(10, 5, 2), set_labels=("Group A", "Group B"), alpha=0.5)
plt.show()

## Priestley Timeline

This displays a timeline of start and end events in time, and their overlap.

In [None]:
timeline_url = (
    "https://github.com/aeturrell/coding-for-economists/raw/main/data/priestley-timeline.csv"
)
# Parse the birth and death dates (day first) and order people by birth
df = pd.read_csv(
    timeline_url,
    parse_dates=["Born", "Died"],
    dayfirst=True,
).sort_values("Born")

# Create the plot
fig, ax = plt.subplots(figsize=(12, 6))

n_people = len(df)
for i, (_, row) in enumerate(df.iterrows()):
    # Earlier births get higher rows
    y_pos = n_people - 1 - i
    lifespan = (row["Died"] - row["Born"]).days
    # Bar starting at the birth date and lasting `lifespan` days
    ax.barh(y_pos, lifespan, left=row["Born"], height=0.5)

    # Add the name inside the bar, centred on the middle of the lifespan
    midpoint = row["Born"] + pd.Timedelta(days=lifespan / 2)
    ax.text(
        midpoint,
        y_pos,
        row["Name"],
        va="center",
        ha="center",
        color="k",
        fontweight="bold",
        fontsize=8,
    )

ax.set_yticks([])
ax.set_xlabel("Year")
plt.show()

## Waffle, isotype, or pictogram charts

These are great for showing easily-understandable magnitudes.

### Matplotlib

There is a package called [**pywaffle**](https://github.com/gyli/PyWaffle) that provides a convenient way of doing this. It expects a dictionary of values. Note that the icon can be changed and, because it builds on **matplotlib**, you can tweak to your heart's content.

In [None]:
from pywaffle import Waffle

# Category -> count. Named `waffle_values` rather than `data` so that it does
# not overwrite the `data` object imported from vega_datasets at the top of
# the notebook.
waffle_values = {"Democratic": 48, "Republican": 46, "Libertarian": 3}
fig = plt.figure(
    FigureClass=Waffle,
    rows=5,
    values=waffle_values,
    colors=["#232066", "#983D3D", "#DCB732"],
    legend={"loc": "upper left", "bbox_to_anchor": (1, 1)},
    icons="child",
    font_size=12,
    icon_legend=True,
)
plt.show()

### Lets-Plot

As ever, **Lets-Plot** prefers tidy format data. We'll create a mini dataset just to demonstrate its use:

In [None]:
import itertools

# Every (x, y) cell of a 10-by-10 grid, one row per cell
grid_cells = list(itertools.product(range(10), repeat=2))
df = pd.DataFrame(grid_cells, columns=["x", "y"])
# Mark the first 32 cells as filled (1) and the rest as empty (0)
df["filled"] = (df.index < 32).astype("int64")
df.head()

In [None]:
# Percentage of filled cells for the label. Round it rather than truncate:
# int() alone would turn a floating-point result such as 28.999999999999996
# into 28.
filled_pct = int(round(100 * df.filled.mean()))
g = (
    ggplot(df, aes(x="x", y="y", fill=as_discrete("filled")))
    + geom_tile(alpha=0.5, color="black")
    + scale_fill_manual(["green", "blue"])
    + coord_flip()
    # Percentage label in the middle of the grid
    + geom_text(x=5, y=5, label=f"{filled_pct}%", size=30, color="white")
    # Remove the axes and grid lines so that only the tiles remain
    + theme(
        axis=element_blank(),
        panel_grid_major=element_blank(),
        panel_grid_minor=element_blank(),
    )
    + xlab("")
    + ylab("")
)
g

## Pyramid



In [None]:
funnel_url = (
    "https://raw.githubusercontent.com/selva86/datasets/master/email_campaign_funnel.csv"
)
df = pd.read_csv(funnel_url)
df.head()

### Matplotlib/Seaborn

In [None]:
from matplotlib.ticker import FuncFormatter

fig, ax = plt.subplots()
group_col = "Gender"
# Reverse the order in which the stages appear in the data
order_of_bars = df.Stage.unique()[::-1]
groups = df[group_col].unique()
# Spread the groups evenly over the Spectral colour map; max(..., 1) avoids
# a division by zero when there is only one group
colors = [plt.cm.Spectral(i / max(len(groups) - 1, 1)) for i in range(len(groups))]

# One set of horizontal bars per group, all drawn on the same axes
for c, group in zip(colors, groups):
    sns.barplot(
        x="Users",
        y="Stage",
        data=df.loc[df[group_col] == group, :],
        order=order_of_bars,
        color=c,
        label=group,
        ax=ax,
        lw=0,
    )

divisor = 1e6
# Show tick values as absolute numbers of millions. A formatter is used
# instead of set_xticklabels so that the labels always match the tick
# positions, even if the ticks are recomputed later.
ax.xaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: str(abs(value) / divisor))
)
plt.xlabel("Users (millions)")
plt.ylabel("Stage of Purchase")
plt.yticks(fontsize=12)
plt.title("Population Pyramid of the Marketing Funnel", fontsize=22)
plt.legend(frameon=False)
plt.show()

### Lets-Plot

Unfortunately, the 20 character limit is hardcoded, so y labels are cut off. But the full text can be seen in the axial tooltip.

In [None]:
funnel_aes = aes(x="Stage", y="Users", fill="Gender", weight="Users")
g = (
    ggplot(df, funnel_aes)
    + geom_bar(width=0.8)  # base plot
    + coord_flip()  # flip coordinates so that the bars are horizontal
    + theme_minimal()
    + ylab("Users (millions)")
)
g

### Plotly

In [None]:
# Funnel chart with the stages on the y axis and user counts on the x axis
fig = px.funnel(df, y="Stage", x="Users")
fig.show()

## Sankey diagram

Sankey diagrams show how a flow breaks into pieces.

### Plotly

In [None]:
import plotly.graph_objects as go

labels = [f"{stage}{k}" for stage in "ABC" for k in (1, 2)]

# Each flow is (source index, target index, value); the indices refer to
# positions in `labels`, eg 0 is A1 and 2 is B1
flows = [
    (0, 2, 7),
    (1, 3, 3),
    (0, 3, 2),
    (2, 4, 6),
    (3, 4, 4),
    (3, 5, 2),
    (2, 5, 1),
]
sources, targets, values = (list(column) for column in zip(*flows))

node_style = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=labels,
    color=px.colors.qualitative.Plotly[: len(labels)],
)
sankey_trace = go.Sankey(
    node=node_style,
    link=dict(source=sources, target=targets, value=values),
)
fig = go.Figure(data=[sankey_trace])

fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
fig.show()

## Dendrogram or hierarchical clustering



### Seaborn



In [None]:
# Data: one row per car model, indexed by the model name
mtcars_url = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/mtcars.csv"
df = pd.read_csv(mtcars_url).rename(columns={"rownames": "Model"}).set_index("Model")
# Plot: clustered heat map using correlation distance and single linkage
cluster_options = dict(
    metric="correlation", method="single", standard_scale=1, cmap="vlag"
)
sns.clustermap(df, **cluster_options);

## Treemap


### Plotly


In [None]:
import numpy as np
import plotly.express as px

df = px.data.gapminder().query("year == 2007")
# Centre the colour scale on the population-weighted mean life expectancy
weighted_life_exp = np.average(df["lifeExp"], weights=df["pop"])
# Hierarchy: a constant "world" root, then continent, then country
hierarchy = [px.Constant("world"), "continent", "country"]
fig = px.treemap(
    df,
    path=hierarchy,
    values="pop",
    color="lifeExp",
    hover_data=["iso_alpha"],
    color_continuous_scale="RdBu",
    color_continuous_midpoint=weighted_life_exp,
)
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
fig.show()