Joining Data

Code for quiz 6, more dplyr and our first interactive chart using echarts4r

Steps 1-6

  1. Load the R packages we will use.
  1. Read the data in the file drug_cos.csv, health_cos.csv in R and assign to the variable drug_cos and health_cos respectively.
drug_cos <- read_csv("https://estanny.com/static/week6/drug_cos.csv")
health_cos <- read_csv("https://estanny.com/static/week6/health_cos.csv")
  1. Use glimpse to get a glimpse of the data
drug_cos %>% glimpse()
Rows: 104
Columns: 9
$ ticker       <chr> "ZTS", "ZTS", "ZTS", "ZTS", "ZTS", "ZTS", "ZTS"~
$ name         <chr> "Zoetis Inc", "Zoetis Inc", "Zoetis Inc", "Zoet~
$ location     <chr> "New Jersey; U.S.A", "New Jersey; U.S.A", "New ~
$ ebitdamargin <dbl> 0.149, 0.217, 0.222, 0.238, 0.182, 0.335, 0.366~
$ grossmargin  <dbl> 0.610, 0.640, 0.634, 0.641, 0.635, 0.659, 0.666~
$ netmargin    <dbl> 0.058, 0.101, 0.111, 0.122, 0.071, 0.168, 0.163~
$ ros          <dbl> 0.101, 0.171, 0.176, 0.195, 0.140, 0.286, 0.321~
$ roe          <dbl> 0.069, 0.113, 0.612, 0.465, 0.285, 0.587, 0.488~
$ year         <dbl> 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018,~
health_cos %>% glimpse()
Rows: 464
Columns: 11
$ ticker      <chr> "ZTS", "ZTS", "ZTS", "ZTS", "ZTS", "ZTS", "ZTS",~
$ name        <chr> "Zoetis Inc", "Zoetis Inc", "Zoetis Inc", "Zoeti~
$ revenue     <dbl> 4233000000, 4336000000, 4561000000, 4785000000, ~
$ gp          <dbl> 2581000000, 2773000000, 2892000000, 3068000000, ~
$ rnd         <dbl> 427000000, 409000000, 399000000, 396000000, 3640~
$ netincome   <dbl> 245000000, 436000000, 504000000, 583000000, 3390~
$ assets      <dbl> 5711000000, 6262000000, 6558000000, 6588000000, ~
$ liabilities <dbl> 1975000000, 2221000000, 5596000000, 5251000000, ~
$ marketcap   <dbl> NA, NA, 16345223371, 21572007994, 23860348635, 2~
$ year        <dbl> 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, ~
$ industry    <chr> "Drug Manufacturers - Specialty & Generic", "Dru~
  1. Which variables are the same in both data sets
names_drug <- drug_cos %>% names()
names_health <- health_cos %>% names()
intersect(names_drug, names_health)
[1] "ticker" "name"   "year"  
  1. Select subset of variables to work with
drug_subset <- drug_cos %>% 
  select(ticker, year, grossmargin) %>% 
  filter(year == 2018)

health_subset <- health_cos %>% 
  select(ticker, year, revenue, gp, industry) %>% 
  filter(year == 2018)
  1. Keep all the rows and columns drug_subset join with columns in health_subset
drug_subset %>% 
  left_join(health_subset)
# A tibble: 13 x 6
   ticker  year grossmargin     revenue          gp industry          
   <chr>  <dbl>       <dbl>       <dbl>       <dbl> <chr>             
 1 ZTS     2018       0.672  5825000000  3914000000 Drug Manufacturer~
 2 PRGO    2018       0.387  4731700000  1831500000 Drug Manufacturer~
 3 PFE     2018       0.79  53647000000 42399000000 Drug Manufacturer~
 4 MYL     2018       0.35  11433900000  4001600000 Drug Manufacturer~
 5 MRK     2018       0.681 42294000000 28785000000 Drug Manufacturer~
 6 LLY     2018       0.738 24555700000 18125700000 Drug Manufacturer~
 7 JNJ     2018       0.668 81581000000 54490000000 Drug Manufacturer~
 8 GILD    2018       0.781 22127000000 17274000000 Drug Manufacturer~
 9 BMY     2018       0.71  22561000000 16014000000 Drug Manufacturer~
10 BIIB    2018       0.865 13452900000 11636600000 Drug Manufacturer~
11 AMGN    2018       0.827 23747000000 19646000000 Drug Manufacturer~
12 AGN     2018       0.861 15787400000 13596000000 Drug Manufacturer~
13 ABBV    2018       0.764 32753000000 25035000000 Drug Manufacturer~

Question: join_ticker

drug_cos_subset <- drug_cos %>% 
  filter(ticker == "MYL")

drug_cos_subset
# A tibble: 8 x 9
  ticker name  location ebitdamargin grossmargin netmargin   ros   roe
  <chr>  <chr> <chr>           <dbl>       <dbl>     <dbl> <dbl> <dbl>
1 MYL    Myla~ United ~        0.245       0.418     0.088 0.161 0.146
2 MYL    Myla~ United ~        0.244       0.428     0.094 0.163 0.184
3 MYL    Myla~ United ~        0.228       0.44      0.09  0.153 0.209
4 MYL    Myla~ United ~        0.242       0.457     0.12  0.169 0.283
5 MYL    Myla~ United ~        0.243       0.447     0.09  0.133 0.089
6 MYL    Myla~ United ~        0.19        0.424     0.043 0.052 0.044
7 MYL    Myla~ United ~        0.272       0.402     0.058 0.121 0.054
8 MYL    Myla~ United ~        0.258       0.35      0.031 0.074 0.028
# ... with 1 more variable: year <dbl>

combo_df <- drug_cos_subset %>%
  left_join(health_cos)

combo_df
# A tibble: 8 x 17
  ticker name  location ebitdamargin grossmargin netmargin   ros   roe
  <chr>  <chr> <chr>           <dbl>       <dbl>     <dbl> <dbl> <dbl>
1 MYL    Myla~ United ~        0.245       0.418     0.088 0.161 0.146
2 MYL    Myla~ United ~        0.244       0.428     0.094 0.163 0.184
3 MYL    Myla~ United ~        0.228       0.44      0.09  0.153 0.209
4 MYL    Myla~ United ~        0.242       0.457     0.12  0.169 0.283
5 MYL    Myla~ United ~        0.243       0.447     0.09  0.133 0.089
6 MYL    Myla~ United ~        0.19        0.424     0.043 0.052 0.044
7 MYL    Myla~ United ~        0.272       0.402     0.058 0.121 0.054
8 MYL    Myla~ United ~        0.258       0.35      0.031 0.074 0.028
# ... with 9 more variables: year <dbl>, revenue <dbl>, gp <dbl>,
#   rnd <dbl>, netincome <dbl>, assets <dbl>, liabilities <dbl>,
#   marketcap <dbl>, industry <chr>

co_name <- combo_df %>% 
  distinct("MYL") %>% 
  pull()

co_location <- combo_df %>% 
  distinct(location) %>% 
  pull()

co_industry <- combo_df %>% 
  distinct(industry) %>% 
  pull()

The company MYL is located in the United Kingdom and is a member of the Drug Manufacturers - Specialty & Generic group.


combo_df_subset <- combo_df %>% 
  select(year, grossmargin, netmargin, revenue, gp, netincome)
combo_df_subset
# A tibble: 8 x 6
   year grossmargin netmargin     revenue         gp netincome
  <dbl>       <dbl>     <dbl>       <dbl>      <dbl>     <dbl>
1  2011       0.418     0.088  6129825000 2563364000 536810000
2  2012       0.428     0.094  6796100000 2908300000 640900000
3  2013       0.44      0.09   6909100000 3040300000 623700000
4  2014       0.457     0.12   7719600000 3528000000 929400000
5  2015       0.447     0.09   9429300000 4216100000 847600000
6  2016       0.424     0.043 11076900000 4697000000 480000000
7  2017       0.402     0.058 11907700000 4783100000 696000000
8  2018       0.35      0.031 11433900000 4001600000 352500000
combo_df_subset %>% 
  mutate(grossmargin_check = gp / revenue,
         close_enough = abs(grossmargin_check - grossmargin) < 0.001)
# A tibble: 8 x 8
   year grossmargin netmargin     revenue         gp netincome
  <dbl>       <dbl>     <dbl>       <dbl>      <dbl>     <dbl>
1  2011       0.418     0.088  6129825000 2563364000 536810000
2  2012       0.428     0.094  6796100000 2908300000 640900000
3  2013       0.44      0.09   6909100000 3040300000 623700000
4  2014       0.457     0.12   7719600000 3528000000 929400000
5  2015       0.447     0.09   9429300000 4216100000 847600000
6  2016       0.424     0.043 11076900000 4697000000 480000000
7  2017       0.402     0.058 11907700000 4783100000 696000000
8  2018       0.35      0.031 11433900000 4001600000 352500000
# ... with 2 more variables: grossmargin_check <dbl>,
#   close_enough <lgl>

combo_df_subset %>% 
  mutate(netmargin_check = netincome / revenue,
         close_enough = abs(netmargin_check - netmargin) < 0.001)
# A tibble: 8 x 8
   year grossmargin netmargin revenue     gp netincome netmargin_check
  <dbl>       <dbl>     <dbl>   <dbl>  <dbl>     <dbl>           <dbl>
1  2011       0.418     0.088 6.13e 9 2.56e9 536810000          0.0876
2  2012       0.428     0.094 6.80e 9 2.91e9 640900000          0.0943
3  2013       0.44      0.09  6.91e 9 3.04e9 623700000          0.0903
4  2014       0.457     0.12  7.72e 9 3.53e9 929400000          0.120 
5  2015       0.447     0.09  9.43e 9 4.22e9 847600000          0.0899
6  2016       0.424     0.043 1.11e10 4.70e9 480000000          0.0433
7  2017       0.402     0.058 1.19e10 4.78e9 696000000          0.0584
8  2018       0.35      0.031 1.14e10 4.00e9 352500000          0.0308
# ... with 1 more variable: close_enough <lgl>

Question: summarize_industry

health_cos %>% 
  group_by(industry) %>% 
  summarise(mean_grossmargin_percent = mean(gp/revenue)*100,
            median_grossmargin_percent = median(gp/revenue)*100,
            min_grossmargin_percent = min(gp/revenue)*100,
            max_grossmargin_percent = max(gp/revenue)*100)
# A tibble: 9 x 5
  industry          mean_grossmargi~ median_grossmar~ min_grossmargin~
  <chr>                        <dbl>            <dbl>            <dbl>
1 Biotechnology                 92.5            92.7             81.7 
2 Diagnostics & Re~             50.5            52.7             28.0 
3 Drug Manufacture~             75.4            76.4             36.8 
4 Drug Manufacture~             47.9            42.6             34.3 
5 Healthcare Plans              20.5            19.6             10.0 
6 Medical Care Fac~             55.9            37.4             28.1 
7 Medical Devices               70.8            72.0             53.2 
8 Medical Distribu~             10.4             5.38             2.49
9 Medical Instrume~             53.9            52.8             40.5 
# ... with 1 more variable: max_grossmargin_percent <dbl>

Question: inline_ticker

health_cos_subset <- health_cos %>% 
  filter(ticker == "AMGN")
health_cos_subset
# A tibble: 8 x 11
  ticker name     revenue      gp    rnd netincome  assets liabilities
  <chr>  <chr>      <dbl>   <dbl>  <dbl>     <dbl>   <dbl>       <dbl>
1 AMGN   Amgen I~ 1.56e10 1.29e10 3.17e9    3.68e9 4.89e10 29842000000
2 AMGN   Amgen I~ 1.73e10 1.41e10 3.38e9    4.34e9 5.43e10 35238000000
3 AMGN   Amgen I~ 1.87e10 1.53e10 4.08e9    5.08e9 6.61e10 44029000000
4 AMGN   Amgen I~ 2.01e10 1.56e10 4.30e9    5.16e9 6.90e10 43231000000
5 AMGN   Amgen I~ 2.17e10 1.74e10 4.07e9    6.94e9 7.14e10 43366000000
6 AMGN   Amgen I~ 2.30e10 1.88e10 3.84e9    7.72e9 7.76e10 47751000000
7 AMGN   Amgen I~ 2.28e10 1.88e10 3.56e9    1.98e9 8.00e10 54713000000
8 AMGN   Amgen I~ 2.37e10 1.96e10 3.74e9    8.39e9 6.64e10 53916000000
# ... with 3 more variables: marketcap <dbl>, year <dbl>,
#   industry <chr>

Run the code below

health_cos_subset %>% 
  distinct(name) %>% 
  pull(name)
[1] "Amgen Inc"
co_name <- health_cos_subset %>% 
  distinct(name) %>% 
  pull(name)
co_industry <- health_cos_subset %>% 
  distinct(industry) %>% 
  pull()

Steps 7-11

  1. Prepare the data for the plots
df <- health_cos %>% 
  group_by(industry) %>% 
  summarise(med_rnd_rev = median(rnd/revenue))
  1. Use glimpse to glimpse the data for the plots
df %>% glimpse()
Rows: 9
Columns: 2
$ industry    <chr> "Biotechnology", "Diagnostics & Research", "Drug~
$ med_rnd_rev <dbl> 0.48317287, 0.05620271, 0.17451442, 0.06851879, ~
  1. Create a static bar chart
ggplot(data = df,
       mapping = aes(
         x = reorder(industry, med_rnd_rev),
         y = med_rnd_rev)) +
  geom_col()+
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  labs(
    title = "Median R&D expenditures",
    subtitle = "by industry as a percent of revenue from 2011 to 2018",
    x = NULL, Y = NULL) +
  theme_ipsum()

  1. Save the previous plot to preview.png and add to the yaml chunk at the top
ggsave(filename = "preview.png",
       path = here::here("_posts", "2022-03-03-joining-data"))
  1. Create an interactive bar chart using the package echarts4r
df %>% 
  arrange(med_rnd_rev) %>% 
  e_charts(x = industry) %>% 
  e_bar(serie = med_rnd_rev, 
        name = "median") %>% 
  e_flip_coords() %>% 
  e_tooltip() %>% 
  e_title(text = "Median industry R&D expenditures",
          subtext = "by industry as a percent of revenue from 2011 to 2018",
          left = "center") %>% 
  e_legend(FALSE) %>% 
  e_x_axis(formatter = e_axis_formatter("percent", digits = 0)) %>% 
  e_y_axis(show = FALSE) %>% 
  e_theme("chalk")