This section shows some diagnostic plots for assessing the transformations and concordances.

## Country lookup table; joined onto the series below via `country.match`.
lookup_matched <-
  here::here("data/interim/003-matched_country_lookup.Rds") %>%
  read_rds()

## Matched series with the lookup columns attached.
## - `series.label` is "<country.match>~<name>" (`name` is expected to be
##   supplied by the lookup join -- confirm against the lookup's columns).
## - `data_source` becomes an ordered factor (TPP < REV3 < REV4) with short
##   labels for plotting.
matched_series <-
  here::here("data/interim/003-matched_tpp_indstat.Rds") %>%
  read_rds() %>%
  left_join(lookup_matched, by = "country.match") %>%
  mutate(
    series.label = paste0(country.match, "~", name),
    data_source = factor(
      data_source,
      levels = c("TPP", "INDSTAT.REV3", "INDSTAT.REV4"),
      labels = c("TPP", "REV3", "REV4"),
      ordered = TRUE
    )
  )

Comparison Plots

Annual Aggregate Gross Output by country and data source

  • Calculate annual gross output by country for each of the datasets (TPP, INDSTAT Rev 3 and INDSTAT Rev 4)
  • Line-graph of annual gross output by country and datasets
  • For each country, the closer the series line up, the better the match and the quality of the final output series
## Overall trend per data source (author's open question: add short name?).
## One row per country / year / data source holding the row count
## (`n_ISIC2.3`) and the summed `value.3` (`total.value`), then the number of
## distinct data sources seen for each country, stored as a factor, and
## finally the lookup columns joined back on by `country.match`.
matched_byYear <- matched_series %>%
  group_by(country.match, year, data_source) %>%
  summarise(
    n_ISIC2.3 = n(),
    total.value = sum(value.3),
    .groups = "drop"
  ) %>%
  group_by(country.match) %>%
  mutate(n_data_source = n_distinct(data_source)) %>%
  ungroup() %>%
  # Converted after ungrouping so the factor levels span all countries.
  mutate(n_data_source = as.factor(n_data_source)) %>%
  left_join(lookup_matched, by = "country.match")
## Yearly `total.value` drawn as faint lines with points on top, coloured by
## data source. Each (name, country.match) pair gets its own facet in a
## single column, with an independent y-axis and the strip on the right.
p.yearTrends_by_source <- ggplot(
  data = matched_byYear,
  mapping = aes(x = year, y = total.value, colour = data_source)
) +
  scale_color_discrete() +
  geom_line(size = 0.7, alpha = 0.3) +
  geom_point(size = 1, alpha = 0.7) +
  facet_wrap(
    facets = vars(name, country.match),
    ncol = 1,
    scales = "free_y",
    strip.position = "right"
  )

## Render the plot with the legend placed above the panels.
p.yearTrends_by_source + theme(legend.position = "top")

Sectoral shares by country and dataset in overlapping years

  • For each country and year, the more similar the colour gradients of the side-by-side bars, the better the match and the quality of the final output series
## Stacked bars normalised to 1 (position = "fill"): one bar per data source,
## filled by ISIC2.3, in a grid of country (rows) by year (columns). Only
## country-years where the number of distinct data sources is not 1 are kept.
p.sectorShares_by_year_source <-
  matched_series %>%
  group_by(country.match, year) %>%
  mutate(n_data_source = n_distinct(data_source)) %>%
  ungroup() %>%
  filter(n_data_source != 1) %>%
  ggplot(mapping = aes(x = data_source, y = value.3)) +
  geom_col(mapping = aes(fill = ISIC2.3), position = "fill") +
  scale_fill_discrete() +
  facet_grid(
    rows = vars(country.match),
    cols = vars(year),
    scales = "free_y"
  )

## Render with vertical x labels, tightly packed bordered panels on a blank
## background, and the legend above the grid.
p.sectorShares_by_year_source +
  theme(
    legend.position = "top",
    axis.text.x = element_text(angle = 90),
    panel.spacing.x = unit(1, "points"),
    panel.background = element_rect(fill = NA),
    panel.border = element_rect(linetype = "solid", fill = NA)
  )

Reference Table: Country Codes

Reference Code: Final Data Export

  • export data used in these plots as .csv
## Write the country lookup to the final-data folder as CSV.
write_csv(
  lookup_matched,
  here::here("data/final/matched_country_lookup.csv")
)

## Write the matched series as CSV, keeping only the five selected columns.
write_csv(
  select(matched_series, data_source, country.match, year, ISIC2.3, value.3),
  here::here("data/final/matched_output.csv")
)