Data Analysis

Load the packages

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(RColorBrewer)

Load the cleaned data back

# Restore the saved objects from disk; the rest of this document reads
# `df_cleaned` (presumably written by an earlier cleaning step -- confirm).
load(file = "data/df_cleaned.RData")

Inspect the cleaned data

What are the companies included in this dataset?

# Distinct values of the `company` column, in order of first appearance.
df_cleaned |>
  pull(company) |>
  unique()
  [1] "Abu Dhabi National Oil Company"         
  [2] "Adani Enterprises"                      
  [3] "Adaro Energy"                           
  [4] "Alliance Resource Partners"             
  [5] "Alpha Metallurgical Resources"          
  [6] "American Consolidated Natural Resources"
  [7] "Anglo American"                         
  [8] "Antero"                                 
  [9] "APA Corporation"                        
 [10] "Arch Resources"                         
 [11] "Banpu"                                  
 [12] "Bapco Energies"                         
 [13] "BASF"                                   
 [14] "BHP"                                    
 [15] "BP"                                     
 [16] "British Coal Corporation"               
 [17] "Bumi Resources"                         
 [18] "Canadian Natural Resources"             
 [19] "Cemex"                                  
 [20] "Cenovus Energy"                         
 [21] "Chesapeake Energy"                      
 [22] "Chevron"                                
 [23] "China (Cement)"                         
 [24] "China (Coal)"                           
 [25] "Cloud Peak"                             
 [26] "CNOOC"                                  
 [27] "CNPC"                                   
 [28] "CNX Resources"                          
 [29] "Coal India"                             
 [30] "ConocoPhillips"                         
 [31] "CONSOL Energy"                          
 [32] "Continental Resources"                  
 [33] "Coterra Energy"                         
 [34] "CRH"                                    
 [35] "Cyprus AMAX Minerals"                   
 [36] "Czech Republic"                         
 [37] "Czechoslovakia"                         
 [38] "Devon Energy"                           
 [39] "Ecopetrol"                              
 [40] "Egyptian General Petroleum"             
 [41] "Eni"                                    
 [42] "EOG Resources"                          
 [43] "EQT Corporation"                        
 [44] "Equinor"                                
 [45] "Exxaro Resources Ltd"                   
 [46] "ExxonMobil"                             
 [47] "Former Soviet Union"                    
 [48] "Gazprom"                                
 [49] "Glencore"                               
 [50] "Heidelberg Materials"                   
 [51] "Hess Corporation"                       
 [52] "Holcim Group"                           
 [53] "Inpex"                                  
 [54] "Iraq National Oil Company"              
 [55] "Kazakhstan"                             
 [56] "Kiewit Mining Group"                    
 [57] "Kuwait Petroleum Corp."                 
 [58] "Libya National Oil Corp."               
 [59] "Lukoil"                                 
 [60] "Marathon Oil"                           
 [61] "Murphy Oil"                             
 [62] "Naftogaz"                               
 [63] "National Iranian Oil Co."               
 [64] "Navajo Transitional Energy Company"     
 [65] "Nigerian National Petroleum Corp."      
 [66] "North American Coal"                    
 [67] "North Korea"                            
 [68] "Novatek"                                
 [69] "Obsidian Energy"                        
 [70] "Occidental Petroleum"                   
 [71] "OMV Group"                              
 [72] "ONGC India"                             
 [73] "Orlen"                                  
 [74] "Ovintiv"                                
 [75] "Peabody Coal Group"                     
 [76] "Pemex"                                  
 [77] "Pertamina"                              
 [78] "Petoro"                                 
 [79] "Petrobras"                              
 [80] "PetroEcuador"                           
 [81] "Petroleos de Venezuela"                 
 [82] "Petroleum Development Oman"             
 [83] "Petronas"                               
 [84] "Pioneer Natural Resources"              
 [85] "Poland"                                 
 [86] "PTTEP"                                  
 [87] "QatarEnergy"                            
 [88] "Repsol"                                 
 [89] "Rio Tinto"                              
 [90] "Rosneft"                                
 [91] "Russian Federation"                     
 [92] "RWE"                                    
 [93] "Santos"                                 
 [94] "Sasol"                                  
 [95] "Saudi Aramco"                           
 [96] "Seriti Resources"                       
 [97] "Shell"                                  
 [98] "Singareni Collieries"                   
 [99] "Sinopec"                                
[100] "Slovakia"                               
[101] "SM Energy"                              
[102] "Sonangol"                               
[103] "Sonatrach"                              
[104] "Southwestern Energy"                    
[105] "Suncor Energy"                          
[106] "Surgutneftegas"                         
[107] "Syrian Petroleum"                       
[108] "Taiheiyo Cement"                        
[109] "Teck Resources"                         
[110] "TotalEnergies"                          
[111] "Tourmaline Oil"                         
[112] "Tullow Oil"                             
[113] "TurkmenGaz"                             
[114] "UK Coal"                                
[115] "Ukraine"                                
[116] "Vale"                                   
[117] "Vistra"                                 
[118] "Westmoreland Mining"                    
[119] "Whitehaven Coal"                        
[120] "Wolverine Fuels"                        
[121] "Woodside Energy"                        
[122] "YPF"                                    

How many kinds of commodities are included in this dataset?

# Distinct values of the `commodity` column, in order of first appearance.
df_cleaned |>
  pull(commodity) |>
  unique()
[1] "Oil & NGL"           "Natural Gas"         "Sub-Bituminous Coal"
[4] "Metallurgical Coal"  "Bituminous Coal"     "Thermal Coal"       
[7] "Anthracite Coal"     "Cement"              "Lignite Coal"       
# NGL stands for Natural Gas Liquids.

How many kinds of owners are included in this dataset?

# Distinct values of the `owner_type` column, in order of first appearance.
df_cleaned |>
  pull(owner_type) |>
  unique()
[1] "State-owned Entity"     "Investor-owned Company" "Nation State"          

Questions

Q1

Q1: For the most recent surveyed year, list the top 10 companies that accounted for the largest share of emissions.

# Q1: each company's share of emissions in the latest surveyed year.
# Keep only rows from the maximum `year`, total emissions per company (`n1`),
# convert each total to a percentage of the overall sum, and sort descending.
df5 <- df_cleaned |>
  filter(year == max(year)) |>
  group_by(company) |>
  summarise(n1 = sum(emissions)) |>
  mutate(percentage = n1 / sum(n1) * 100) |>
  arrange(desc(percentage))

# Display the ten largest contributors.
df5 |>
  head(n = 10)
# A tibble: 10 × 3
   company                            n1 percentage
   <chr>                           <dbl>      <dbl>
 1 China (Coal)                   12290.      32.6 
 2 Saudi Aramco                    1962.       5.20
 3 Coal India                      1407.       3.73
 4 Gazprom                         1255.       3.32
 5 National Iranian Oil Co.        1209.       3.20
 6 Russian Federation              1072.       2.84
 7 China (Cement)                  1050        2.78
 8 Rosneft                          934.       2.47
 9 Abu Dhabi National Oil Company   705.       1.87
10 CNPC                             702.       1.86

Q2

Q2: For the most recent 10 years (2013 onward), list the top 10 companies that accounted for the largest share of emissions.

# Q2: each company's share of emissions from 2013 onward.
# `year` is numeric, so it is compared against the number 2013; comparing it
# to the string "2013" would coerce every year to text and compare the values
# lexicographically.
df5 <- df_cleaned |>
  filter(year >= 2013) |>
  group_by(company) |>
  summarise(n1 = sum(emissions)) |>
  mutate(percentage = n1 / sum(n1) * 100) |>
  arrange(desc(percentage))

# Display the ten largest contributors.
head(df5, n = 10)
# A tibble: 10 × 3
   company                             n1 percentage
   <chr>                            <dbl>      <dbl>
 1 China (Coal)                   104353.      29.2 
 2 Saudi Aramco                    18509.       5.17
 3 Gazprom                         14119.       3.95
 4 China (Cement)                  11782.       3.29
 5 Coal India                      11501.       3.21
 6 National Iranian Oil Co.        11027.       3.08
 7 Russian Federation               9774.       2.73
 8 Rosneft                          7884.       2.20
 9 CNPC                             7107.       1.99
10 Abu Dhabi National Oil Company   6393.       1.79

Q3

Q3: For the most recent surveyed year, what percentage of total emissions (across all companies) comes from each commodity of China (Coal)?

Answer (% of total emissions): Bituminous Coal 22.92; Metallurgical Coal 4.61; Anthracite Coal 4.15; Lignite Coal 0.89

# Q3: for the latest year, the share of *all* emissions contributed by each
# commodity of "China (Coal)".
df8 <- df_cleaned |>
  filter(year == max(year)) |>
  # Total across every company in that year; must be computed before the
  # data is narrowed to a single company.
  mutate(total_emissions = sum(emissions)) |>
  filter(company == "China (Coal)") |>
  group_by(commodity) |>
  # Aggregate inside summarise() so exactly one row per commodity is returned,
  # even when a commodity has several records. `total_emissions` is constant,
  # so its first value is used as the denominator.
  summarise(percentage = sum(emissions) / first(total_emissions) * 100) |>
  arrange(desc(percentage)) |>
  mutate(percentage = round(percentage, 2))

Q4

Q4: For the most recent 10 years, what are the trends in emissions of China (Coal), China (Cement) and CNPC?

# Q4: yearly emissions since 2013 for three Chinese entities, one line each.
df_cleaned |>
  # `year` is numeric, so compare against a number rather than a string.
  filter(year >= 2013) |>
  filter(company %in% c("China (Coal)", "China (Cement)", "CNPC")) |>
  group_by(company, year) |>
  # Drop the grouping explicitly; this also silences the summarise() message.
  summarise(n = sum(emissions), .groups = "drop") |>
  ggplot(aes(x = year, y = n, color = company)) +
  # Line thickness is `linewidth` since ggplot2 3.4.0; `size` is deprecated
  # for lines and triggers a warning.
  geom_line(linewidth = 1) +
  geom_point(size = 1.5) +
  scale_x_continuous(breaks = seq(2013, 2022, by = 1)) +
  scale_color_brewer(palette = "Set2") +
  labs(
    title = "Emissions of China (Coal), China (Cement), and CNPC",
    x = "Year",
    y = "Emissions(MtCO2e)",
    color = "Company"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "right",
    legend.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold")
  )
`summarise()` has grouped output by 'company'. You can override using the
`.groups` argument.
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

# No `plot` argument is given, so ggsave() falls back to its default plot.
# Output is a 10 x 6 inch PNG at 300 dpi.
ggsave(
  filename = "out/Summary-4.1.png",
  width = 10,
  height = 6,
  dpi = 300
)

Q5

Q5: For the most recent 10 years, which commodity has contributed the most to the emissions of China (Coal), China (Cement) and CNPC?

# Q5: total emissions since 2013 by commodity for the three Chinese entities,
# shown as a horizontal bar chart ordered by size.
df_cleaned |>
  # `year` is numeric, so compare against a number rather than a string.
  filter(year >= 2013) |>
  filter(company %in% c("China (Coal)", "China (Cement)", "CNPC")) |>
  group_by(commodity) |>
  summarise(n = sum(emissions)) |>
  ggplot(aes(x = fct_reorder(commodity, n), y = n, fill = commodity)) +
  # A single column layer; an additional default-width geom_col() underneath
  # would hide the effect of `width = 0.7`.
  geom_col(width = 0.7) +
  coord_flip() +
  scale_fill_brewer(palette = "Pastel2") +
  labs(
    title = "Emissions by commodity of the three Chinese companies",
    x = "Commodity",
    y = "Emissions (MtCO2e)"
  ) +
  theme_bw() +
  # Add rounded values next to the bars.
  geom_text(aes(label = round(n)), hjust = -0.3, size = 3) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "right",
    legend.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold")
  ) +
  # Start the value axis at zero and pad its upper end in proportion to the
  # plotted data, leaving room for the labels.
  scale_y_continuous(expand = expansion(mult = c(0, 0.2)))

ggsave("out/Summary-2.3.png", width = 10, height = 6, dpi = 300)

Q6

Q6: For the most recent 10 years, which commodity has contributed the most to emissions?

# Q6: total emissions per commodity since 2013, largest first.
df_cleaned |>
  # `year` is numeric, so compare against a number rather than a string.
  filter(year >= 2013) |>
  group_by(commodity) |>
  summarise(n1 = sum(emissions)) |>
  arrange(desc(n1))
# A tibble: 9 × 2
  commodity                n1
  <chr>                 <dbl>
1 Oil & NGL           106470.
2 Bituminous Coal      89149.
3 Natural Gas          74247.
4 Metallurgical Coal   24920.
5 Sub-Bituminous Coal  17200.
6 Anthracite Coal      14477.
7 Cement               13740.
8 Thermal Coal         10022.
9 Lignite Coal          7639.

Q7

Q7: A follow-up question to Q6: what are the trends in emissions of the different commodities over the past decade?

# Q7: yearly emissions per commodity since 2013, one line per commodity.
# The plot is built once and stored in `p` so it can be displayed, saved, and
# reused by the interactive version further down.
p <- df_cleaned |>
  # `year` is numeric, so compare against a number rather than a string.
  filter(year >= 2013) |>
  group_by(year, commodity) |>
  # summarise() keeps one row per year/commodity pair, so each point is drawn
  # once rather than once per underlying record.
  summarise(each_emissions = sum(emissions), .groups = "drop") |>
  ggplot(aes(x = year, y = each_emissions, color = commodity)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq(2013, 2022, by = 1)) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Global emissions of different commodities over the past decade",
    x = "Year",
    y = "Emissions (MtCO2e)",
    color = "Commodity"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "right",
    legend.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold"),
    panel.grid.minor = element_line(color = "grey90"),
    panel.grid.major = element_line(color = "grey80")
  )

# Display the plot.
p

# Save exactly this plot (not whichever plot happened to be shown last).
ggsave("out/Summary-2.1.png", plot = p, width = 10, height = 6, dpi = 300)

Interactive plot.

library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
# Interactive version of the Q7 plot stored in `p`.
ggplotly(p)
# ggsave() writes a static ggplot image, not the plotly widget. Pass `p`
# explicitly so the file does not depend on which plot was displayed last.
ggsave("out/Summary-2.2.png", plot = p, width = 10, height = 6, dpi = 300)

### Plot with facets

# Q7 (faceted): the same yearly totals, with one panel per commodity.
df_cleaned |>
  # `year` is numeric, so compare against a number rather than a string.
  filter(year >= 2013) |>
  group_by(year, commodity) |>
  # One row per year/commodity pair, so each point is drawn only once.
  summarise(each_emissions = sum(emissions), .groups = "drop") |>
  ggplot(aes(x = year, y = each_emissions, color = commodity)) +
  geom_line() +
  geom_point() +
  # Every second year only: the facet panels are narrow.
  scale_x_continuous(breaks = seq(2013, 2022, by = 2)) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Emissions of different commodities over the past 10 years",
    x = "Year",
    y = "Emissions (MtCO2e)",
    color = "Commodity"
  ) +
  facet_wrap(~commodity) +
  theme_bw()

Q8

Q8: For the recent 10 years, what are the total emissions for each type of commodity?

# Q8: total emissions per commodity since 2013 as a horizontal bar chart,
# ordered by size and labelled with rounded values.
df_cleaned |>
  # `year` is numeric, so compare against a number rather than a string.
  filter(year >= 2013) |>
  group_by(commodity) |>
  summarise(each_emissions = sum(emissions)) |>
  ggplot(aes(
    x = fct_reorder(commodity, each_emissions),
    y = each_emissions,
    fill = commodity
  )) +
  # A single column layer; an additional default-width geom_col() underneath
  # would hide the effect of `width = 0.7`.
  geom_col(width = 0.7) +
  # Use the geom_text function to add values to the bar chart.
  geom_text(aes(label = round(each_emissions)), hjust = -0.3, size = 3.8) +
  coord_flip() +
  scale_fill_brewer(palette = "Pastel1") +
  labs(
    title = "Global emissions by commodity over the past decade",
    x = "Commodity",
    y = "Emissions (MtCO2e)"
  ) +
  theme_bw() +
  # Pad the upper end of the value axis so the labels are not clipped.
  scale_y_continuous(expand = expansion(mult = c(0, 0.5)))

ggsave("out/Summary-2.5.png", width = 10, height = 6, dpi = 300)
# Q8 (vertical variant): total emissions per commodity since 2013.
df_cleaned |>
  # `year` is numeric, so compare against a number rather than a string.
  filter(year >= 2013) |>
  group_by(commodity) |>
  summarise(each_emissions = sum(emissions)) |>
  ggplot(aes(x = commodity, y = each_emissions, fill = commodity)) +
  # A single column layer; an additional default-width geom_col() underneath
  # would hide the effect of `width = 0.5`.
  geom_col(width = 0.5) +
  scale_fill_brewer(palette = "Pastel1") +
  labs(
    title = "Global emissions by commodity over the past decade",
    x = "Commodity",
    y = "Emissions (MtCO2e)"
  ) +
  theme_bw() +
  # Remove the unnecessary legend and tilt the commodity names so they do not
  # overlap. `hjust` belongs inside element_text(); passing it to theme()
  # directly is not a valid theme element and only produces a warning.
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 20, hjust = 1)
  )
Warning in plot_theme(plot): The `hjust` theme element is not defined in the
element hierarchy.

Q9

Q9: For Natural Gas, over the past decade, calculate the quantity produced by companies of different owner_type in each year.

# Q9: yearly Natural Gas production since 2013, split by owner type.
df_cleaned |>
  # `year` is numeric, so compare against a number rather than a string.
  filter(year >= 2013, commodity == "Natural Gas") |>
  group_by(year, owner_type) |>
  # One row per year/owner_type pair, so each point is drawn only once.
  summarise(total_production = sum(quantity), .groups = "drop") |>
  ggplot(aes(x = year, y = total_production, color = owner_type)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq(2013, 2022, 1)) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Production of Natural Gas of different companies",
    x = "Year",
    y = "Production(billion cubic feet)",
    color = "Owner_type"
  ) +
  theme_bw()

Q10

Q10: Two periods: (1) 1854–1874, before the invention of the internal combustion engine; (2) 1875–1894. What are the trends in the production of “Oil & NGL” and in emissions?

# Q10 (emissions): total emissions per year for 1854-1894 inclusive.
df_cleaned |>
  # `year` is numeric, so the bounds are numbers rather than strings.
  filter(year >= 1854, year <= 1894) |>
  group_by(year) |>
  summarise(emissions_by_year = sum(emissions)) |>
  ggplot(aes(x = year, y = emissions_by_year)) +
  geom_line() +
  # Points are shaded by the yearly total.
  geom_point(aes(color = emissions_by_year)) +
  scale_x_continuous(breaks = seq(1854, 1894, 5)) +
  labs(
    title = "Emissions before and after the internal combustion engine",
    x = "Year",
    y = "Emissions(MtCO2e)"
  ) +
  theme_bw()

# Q10 (production): yearly "Oil & NGL" quantity for 1854-1894 inclusive.
df_cleaned |>
  # `year` is numeric, so the bounds are numbers rather than strings.
  filter(year >= 1854, year <= 1894) |>
  filter(commodity == "Oil & NGL") |>
  group_by(year) |>
  summarise(quantity_by_year = sum(quantity)) |>
  ggplot(aes(x = year, y = quantity_by_year)) +
  geom_line() +
  # Points are shaded by the yearly total.
  geom_point(aes(color = quantity_by_year)) +
  scale_x_continuous(breaks = seq(1854, 1894, 5)) +
  labs(
    title = "Production of Oil & NGL before and after the internal combustion engine",
    x = "Year",
    y = "Production"
  ) +
  theme_bw()

Q11

Q11: Trends in emissions by owner_type over the past decade.

# Q11 (columns): yearly emissions since 2013, stacked by owner type.
df_cleaned |>
  # `year` is numeric, so compare against a number rather than a string.
  filter(year >= 2013) |>
  group_by(year, owner_type) |>
  # Drop the grouping explicitly; this also silences the summarise() message.
  summarise(total = sum(emissions), .groups = "drop") |>
  ggplot(aes(x = year, y = total, fill = owner_type)) +
  geom_col(width = 0.7) +
  scale_x_continuous(breaks = seq(2013, 2022, 1)) +
  scale_fill_brewer(palette = "Pastel1") +
  labs(
    title = "Emissions by owner_type over the past decade",
    x = "Year",
    y = "Emissions (MtCO2e)"
  ) +
  theme_bw()
`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.

# Q11 (lines): yearly emissions since 2013, one line per owner type.
df_cleaned |>
  # `year` is numeric, so compare against a number rather than a string.
  filter(year >= 2013) |>
  group_by(year, owner_type) |>
  # Drop the grouping explicitly; this also silences the summarise() message.
  summarise(total = sum(emissions), .groups = "drop") |>
  ggplot(aes(x = year, y = total, color = owner_type)) +
  # Both layers inherit the colour mapping from ggplot().
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq(2013, 2022, 1)) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Emissions by owner_type over the past decade",
    x = "Year",
    y = "Emissions(MtCO2e)",
    color = "Owner_type"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "right",
    legend.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold"),
    panel.grid.minor = element_line(color = "grey90"),
    panel.grid.major = element_line(color = "grey80")
  )
`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.

# No `plot` argument is given, so ggsave() falls back to its default plot.
# Output is a 10 x 6 inch PNG at 300 dpi.
ggsave(
  filename = "out/Summary-3.png",
  width = 10,
  height = 6,
  dpi = 300
)