WB Project PDOs Data Preprocessing

Author

Luisa M. Mimmi

Warning

WORK IN PROGRESS! (Please expect unfinished sections, and unpolished code. Feedback is welcome!)

Set up

# Pckgs -------------------------------------
library(fs) # Cross-Platform File System Operations Based on 'libuv'
library(tidyverse) # Easily Install and Load the 'Tidyverse'
library(janitor) # Simple Tools for Examining and Cleaning Dirty Data
library(skimr) # Compact and Flexible Summaries of Data
library(here) # A Simpler Way to Find Your Files
library(paint) # paint data.frames summaries in colour
library(readxl) # Read Excel Files
library(tidytext) # Text Mining using 'dplyr', 'ggplot2', and Other Tidy Tools
library(SnowballC) # Snowball Stemmers Based on the C 'libstemmer' UTF-8 Library
library(rsample) # General Resampling Infrastructure
library(rvest) # Easily Harvest (Scrape) Web Pages
library(cleanNLP) # A Tidy Data Model for Natural Language Processing #

Data sources

WB Projects & Operations

World Bank Projects & Operations can be explored at:

The Accessibility Classification is public under Creative Commons Attribution 4.0

Process to ingest & preprocess raw PDO text data

  1. Retrieve manually ALL WB projects approved between FY 1947 and 2026 as of 31/08/2024 using simply the Excel button on this page: WBG Projects
  2. Split the dataset and keep only projs_train (50% of projects with PDO text, i.e. 4413 PDOs)
  3. Clean the projs_train dataset
  4. Obtain PoS tagging + tokenization with cleanNLP package (functions cnlp_init_udpipe() + cnlp_annotate()) and saved projs_train_t (cleaned version of projs_train dataset).

I had made a prior attempt to ingest the WB document data using the rvest package, following the work of Stephanie Tran project who created the function R/f_scrape_WB-OKR.R. However the WB website has changed and the function is not working anymore.

✅ Ingest Projects data (manually from *.xlsx file)

all_projects_as_of29ago2024 <- read_excel(here::here ("data", "raw_data", "project2","all_projects_as_of29ago2024.xls"), 
                                            col_names = FALSE,
                                            skip = 1) 
# Nomi delle colonne
cnames <- read_excel(here::here("data", "raw_data", "project2", 
                                "all_projects_as_of29ago2024.xls"), 
                         col_names = FALSE,
                         skip = 1,
                     n_max = 2) 
# file completo
all_proj <- read_excel(here::here("data", "raw_data", "project2", 
                                  "all_projects_as_of29ago2024.xls"), 
                         col_names = TRUE,
                         skip = 2) 
# Save as .RDS 
save(all_proj, file = here::here("data", "raw_data", "project2", 
                                 "all_projects_as_of29ago2024.Rdata") ) 
rm(all_projects_as_of29ago2024)

Explore Project mega file

paint::paint(cnames)
paint::paint(all_proj)

skimr::skim(all_proj$id) # complete_rate 1
skimr::skim(all_proj$pdo)  # complete_rate 0.503
skimr::skim(all_proj$boardapprovaldate) # complete_rate 0.779
skimr::skim(all_proj$closingdate) # complete_rate 0.707 

_______

CLEANING FILE PDOs

Recode all_proj

This data set has a lot of blank values, probably also bc some information was not collected way back in 1947… (e.g. PDO)

# Mess of data format weird in different ways in 2 cols: 
    # 1947-12-31 12:00:00 # closingdate
    # 8/3/1948 12:00:00 AM # closingdate
    # 
    # 1955-03-15T00:00:00Z # boardapprovaldate
 
# Mutate the date columns to parse the dates, handling different formats and blanks
all_proj_t <- all_proj %>%
   # 1) Parsed using parse_date_time() with mdy HMS and mdy HMSp to handle "MM/DD/YYYY HH:MM AM/PM" formats.
  mutate(across("closingdate", ~ if_else(
    . == "", 
    NA_POSIXct_,  # Return NA for blank entries
    parse_date_time(., orders = c("mdy HMS", "mdy HMSp"))
  )),
    # 2) Parsed using ymd_hms() because it follows the ISO 8601 format (e.g., "1952-04-29T00:00:00Z").
  across("boardapprovaldate", ~ if_else(
    . == "", 
    NA_POSIXct_,  # Return NA for blank entries
    ymd_hms(., tz = "UTC")  # Handle ISO 8601 format (e.g., "1952-04-29T00:00:00Z")
  ))) %>% 
   mutate(boardapproval_year = year(boardapprovaldate),
          boardapproval_month = month(boardapprovaldate)) %>% 
   mutate(boardapprovalFY = case_when( 
             boardapproval_month >= 1 & boardapproval_month < 7 ~ boardapproval_year,
             boardapproval_month >= 7 & boardapproval_month <= 12 ~ boardapproval_year +1)) %>% 
   relocate(boardapprovalFY, .after = boardapprovaldate ) %>% 
   mutate(closingdate_year = year(closingdate),
          closingdate_month = month(closingdate)) %>% 
   mutate(closingdateFY = case_when( 
             closingdate_month >= 1 & closingdate_month < 7 ~ closingdate_year,
             closingdate_month >= 7 & closingdate_month <= 12 ~ closingdate_year +1)) %>% 
   relocate(closingdateFY, .after = closingdate ) 

tabyl(all_proj$closingdate)   
tabyl(all_proj_t$closingdateFY)

tabyl(all_proj$boardapprovaldate)   
tabyl(all_proj_t$boardapprovalFY)

skimr::skim(all_proj_t$boardapprovalFY) # complete_rate 0.779
skimr::skim(all_proj_t$closingdateFY) # complete_rate 0.707

Recap content

A lot of PDOs are missing, especially from earlier FYs

  • 22,569 Projects’ IDs, of which…
  • 9,774 (non missing) PDOs … 49,7% are missing
  • 22,569-4981 = 17,588 (non missing) board approval FY … 22.1% are missing
    • ranging from 1947 - 2026

[FUN] Count missing/distinct values of columns

# Function to count missing values and distinct values in a subset of columns
f_recap_values <- function(data, columns) {
   # Select the subset of columns
   df_subset <- data %>% select(all_of(columns))

   # 1) Use skimr to skim the data
   skimmed <- skim(df_subset)
   
   # 2) Get the number of rows in the dataset
   total_rows <- nrow(df_subset)

   
   # 3) Calculate the number of distinct values for each column
   distinct_counts <- df_subset %>%
      summarise(across(everything(), n_distinct)) %>%
      pivot_longer(everything(), names_to = "skim_variable", values_to = "n_distinct")
   
   # Extract the relevant columns for column names, missing values, and distinct counts
   missing_table <- skimmed %>%  # 1) 
      select(skim_variable, n_missing) %>%
      # Add the total number of rows
      mutate(total_rows = total_rows) %>%  # 2) 
      # Join with distinct counts
      left_join(distinct_counts, by = "skim_variable") %>%  # 3) 
      relocate(n_distinct, n_missing, .after = total_rows) %>%
      mutate(missing_perc = round((n_missing/total_rows)*100, 1), 
             missing_perc = glue::glue("{missing_perc}%")) %>%
      arrange(desc(n_distinct))
   
   # Return the table
   return(missing_table) 
}

# exe use 
#f_recap_values(df, c("col1","col2"))
# Use the function on a subset of columns
f_recap_values(all_proj_t, c("id","pdo", "projectstatusdisplay", "boardapprovalFY", "regionname","sector1", "theme1", "projectfinancialtype"))

Which PDOs are missing?

Ideally, the PDOs are missing in a non systematic way. To check it, I compare the distributions across some key features of ALL PROJs v. NON-missing-PDO-PROJs

missing_pdo <- all_proj_t %>% 
   #select(id, pdo, countryname, projectstatusdisplay, lendinginstr, boardapprovalFY, projectfinancialtype) %>% 
   filter(is.na(pdo))

# Now I compare to get a sense of distribution in all_proj_t v. missing_pdo... 
tabyl(all_proj_t$projectstatusdisplay) %>%  adorn_pct_formatting()
tabyl(missing_pdo$projectstatusdisplay) %>%  adorn_pct_formatting()
# Region  (seem same)
tabyl(all_proj_t$regionname)  %>% adorn_pct_formatting() 
tabyl(missing_pdo$regionname)  %>% adorn_pct_formatting() 
# FY (seem same)
tabyl(all_proj_t$boardapprovalFY) %>%  adorn_pct_formatting()
tabyl(missing_pdo$boardapprovalFY) %>%  adorn_pct_formatting()
# too many sectors... (seem same)
tabyl(all_proj_t$sector1)  %>% adorn_pct_formatting() 
tabyl(missing_pdo$sector1)  %>% adorn_pct_formatting() 
# too little filled themes
tabyl(all_proj_t$theme1)  %>% adorn_pct_formatting() # 51.9% NA
tabyl(missing_pdo$theme1)  %>% adorn_pct_formatting() # 67.4%  NA
# Environmental Assessment Category
tabyl(all_proj_t$envassesmentcategorycode)  %>% adorn_pct_formatting() # 92.1%   NA
tabyl(missing_pdo$envassesmentcategorycode)  %>% adorn_pct_formatting() 
# Environmental and Social Risk
tabyl(all_proj_t$esrc_ovrl_risk_rate)  %>% adorn_pct_formatting() # 98.4%  NA
tabyl(missing_pdo$esrc_ovrl_risk_rate)  %>% adorn_pct_formatting() 
# by fin instrum 
tabyl(all_proj_t$projectfinancialtype)  %>% adorn_pct_formatting() 
tabyl(missing_pdo$projectfinancialtype)  %>% adorn_pct_formatting() 
# by lending instrum
tabyl(all_proj_t$lendinginstr)  %>% adorn_pct_formatting()  # Spec Inv Loan 6567 |29.1%
tabyl(missing_pdo$lendinginstr)  %>% adorn_pct_formatting() # Spec Inv Loan 4928 |43.9%

[Chi-Squared Test in the case of projectfinancialtype]

# Chi-Square Test: if a discrete distribution of a sample reflects the population, you can use a chi-square goodness-of-fit test

# by fin instrum 
pop <- tabyl(all_proj_t$projectfinancialtype, show_na = TRUE)  %>% 
   adorn_pct_formatting() %>% 
   mutate(across(ends_with("percent"), ~parse_number(.)))
obs <- tabyl(missing_pdo$projectfinancialtype, show_na = TRUE)  %>% 
   adorn_pct_formatting() %>% 
   mutate(across(ends_with("percent"), ~parse_number(.)))

# ---- Sample data (observed counts)
# Sample counts for categories A, B, and C
observed <- obs[,"n"]
observed
# ---- Population data (expected proportions or counts)
# Proportions of A, B, C in the population
expected_proportions <- pop[, ("percent")]/100   
expected_proportions

# Total number of observations in the sample
total_sample_size <- sum(observed)  

# Perform the chi-square test
chisq_test <- chisq.test(observed, p = expected_proportions)

# Display the results
chisq_test
# data:  observed
# X-squared = 3421.4, df = 4, p-value < 2.2e-16

# low p-value (typically < 0.05) suggests that the sample distribution is significantly different from the population!!!!

[Chi-Squared Test in the case of lendinginstr]

# Chi-Square Test: if a discrete distribution of a sample reflects the population, you can use a chi-square goodness-of-fit test

# by fin instrum 
pop2 <- tabyl(all_proj_t$lendinginstr, show_na = TRUE)  %>% 
   adorn_pct_formatting() %>% 
   mutate(across(ends_with("percent"), ~parse_number(.)))
obs2 <- tabyl(missing_pdo$lendinginstr, show_na = TRUE)  %>% 
   adorn_pct_formatting() %>% 
   mutate(across(ends_with("percent"), ~parse_number(.)))

# ---- Sample data (observed counts)
# Sample counts for categories A, B, and C
observed2 <- obs2[,"n"]
observed2
# ---- Population data (expected proportions or counts)
# Proportions of A, B, C in the population
expected_proportions2 <- pop2[, ("percent")]/100   
expected_proportions2

# Total number of observations in the sample
total_sample_size <- sum(observed2)  

# Perform the chi-square test
chisq_test2 <- chisq.test(observed2, p = expected_proportions2)

# Display the results
chisq_test2
# data:  observed
# X-squared = 3421.4, df = 4, p-value < 2.2e-16

# low p-value (typically < 0.05) suggests that the sample distribution is significantly different from the population!!!!

Based on some available project features, I would say that even if many projects have missing feature value, PDO texts’ missingness seems to happen at random, except maybe for:

  • approval FY
  • projectfinancialtype (Grants, IBRD, etc) but there are many missing type (many more in with-PDO sample)…
  • lendinginstr specific Investment Loan are missing PDO in 4928 pr (43.9%).

Why?

_______

TEXT PREPROCESSING

Obtain Reduced df projs

For my purposes, it is safe to drop all the projects with missing PDO !

  • it turns out there are no Project Development Objectives spelled out until FY2001, so the usable population of projects (with PDO text) spans from FY 2001 to FY 2023.
  • I will also drop the projects with missing projectstatusdisplay (status)
  • … and boardapprovalFY (year of approval) > 2024 as incomplete data
projs <- all_proj_t %>%                    # 22,659
   filter(!is.na(pdo)) %>%                 # 11,353  = 11306 MISSING
   # filter(boardapprovalFY < 2001) %>%  # 7548
   # 4 proj have PDO == "." 
   filter( pdo != ".") %>%                  # 11,346
   filter( pdo != "-") %>%                  # 11,344
   filter( pdo != "NA") %>%                  # 11,344
   filter( pdo != "N/A") %>%                   # 11,342
   filter( pdo != "No change") %>%                # 11,340
   filter( pdo != "No change to PDO following restructuring.") %>%   # 11,337
   filter( pdo != "PDO remains the same.") %>%  # 11,335
   filter(!(pdo %in% c("TBD", "TBD.", "Objective to be Determined."))) %>%   # 11,326
   filter(!(pdo %in% c("XXXXXX", "XXXXX", "XXXX", "a"))) %>%   # 11,322 = VALID PDO AVAILABLE
   # No proj without status 
   filter(!is.na(projectstatusdisplay)) %>%  # 11,319  ---- 3 missing
   # No proj without approval FY 
   filter(!is.na(boardapprovalFY)) %>% # 9,143   ---- 2,176 missing
   # No project with approval FY >= 2024  
   filter(boardapprovalFY < 2024)  %>% # 8811  ----- 2508 
   select(id, pr_name = project_name, pdo, boardapprovalFY, closingdateFY,status = projectstatusdisplay, regionname, countryname, sector1, theme1 ,
          lendinginstr,env_cat = envassesmentcategorycode, ESrisk = esrc_ovrl_risk_rate ,curr_total_commitment  )   

tabyl(projs$boardapprovalFY)  %>% adorn_pct_formatting() # most NA

nrow(projs) # 8829 
paint(cnames) 
paint(projs)
# delete all in env EXCEPT for c(... )
rm(list=setdiff(ls(), c("projs", "cnames" )))

Duplicated PDOs 🤯

str(projs)
skim(projs$id)
skim(projs$pdo)

n_distinct(projs$id) # 8811
n_distinct(projs$pdo) # 7582  !!!!!!!! 

# Detect duplicated PDOs and the corresponding IDs
duplicates <- projs %>%
  group_by(pdo) %>%
  filter(n() > 1) %>%   # Filter PDOs that appear more than once
  select(id, pdo)       # Select only id and pdo columns

# View the duplicated PDOs and their corresponding IDs
nrow(duplicates) # 2235

# Count duplicates per each unique ID based on duplicated PDOs
count_dp <- duplicates %>%          # Filter PDOs that appear more than once
  group_by(pdo) %>%             # Group by id
  summarise(dup_count = n())   # Count occurrences of duplicated PDOs per id

min(count_dp$dup_count) # 2
max(count_dp$dup_count) # 9
mean(count_dp$dup_count) # 2.22332
n_distinct(duplicates$id)  # 2235
n_distinct(duplicates$pdo)  # 1006

There are 2235 projects with NON-UNIQUE PDO text in the dataset. In some cases, evidently, the same PDO is used for multiple projects (from a minimum of 2 to a maximum of 9 time!!!), most likely when there is a parent project or subsequent phases of the same.

Manual correction text [projs] 🤚🏻

(Here I re-wrote the PDO for some projects that had misspelled words, lacking spaces, some typos and HTML formatting characters. No substantial changes)

projs$pdo[projs$id == "P164414"] <- "The Multisector Development Policy Financing (DPF) intends to support Ukraine's highest priority reforms to move from economic stabilization to stronger and sustained economic growth by addressing deeper structural bottlenecks and governance challenges in key areas. Possible policy areas include : (i) strengthening private sector competitiveness, including reforming land markets and the financial sector; (ii) promoting sustainable and effective public services, including reforming pensions, social assistance, and health; and (iii ) improving governance, including reforming anticorruption institutions and tax administration. The financing  DPL or Policy Based Guarantee (PBG)."

projs$pdo[projs$id == "P111432"] <- "Project development objectives for RCIP 3 include the following: Malawi: Support the Recipient's efforts to improve the quality, availability and affordability of broadband within its territory for both public and private users. Mozambique: Support the Recipient's efforts to contribute to lower prices for international capacity and extend the geographic reach of broadband networks and to contribute to improved efficiency and transparency through eGovernment applications. Tanzania: Support the Recipient's efforts to: (i) lower prices for international capacity and extend the geographic reach of broadband networks; and (ii) improve the Government's efficiency and transparency through eGovernment applications."

projs$pdo[projs$id == "P252350"] <- "The Program Development Objective is to expand opportunities for the acquisition of quality, market-relevant skills in selected economic sectors. The selected economic sectors include Energy, Transport and Logistics, and Manufacturing (with a focus on ‘Made-Rwanda’ products such as construction materials, light manufacturing and agro-processing). Building skills to advance the country’s economic agenda is a key priority of the GoR’s ongoing Economic Development and Poverty Reduction Strategy-2 (EDPRS2) launched in 2013. EDPRS2 builds on the country’s Vision 2020 which seeks to transform the country by raising its per capita GDP to middle-income level by 2020. The Program is grounded in the Government of Rwanda’s (GoR) National Employment Programs (NEP) approved by Cabinet in 2014. NEP was designed to address the employment challenges in Rwanda and equip its population with the skills required to supporteconomic development. The main results areas of the operation are: (i) reinforcing governance of the skills development system; (ii) ensuring provision of quality training programs with market relevance; (iii) expanding opportunities for continuous upgrading of job-relevant skills for sustained employability; and (iv) capacity building for implementation. The Program will disburse against achievement of specific Disbursement Linked Results (DLRs) in these results areas"

projs$pdo[projs$id == "P143376"] <- "The Project main objective is to promote the reduction of climate change impacts in the Cerrado of the west of the Bahia State by: (i) promoting the environmental regularization of land holdings in the targeted municipalities and support actions to promote recovery of environmental liabilities; and (ii) strengthening the State's capacity to prevent and combat forest fires through the integration of local actors and promoting the adoption of sustainable production practices in the targeted municipalities."

projs$pdo[projs$id == "P129428"] <- "The development objective of this task is to support the Ministry of Finance and other agencies with the Government of the Dominican Republic in their efforts to improve public expenditure management. Specifically, the objectives to be financed are to: (i) design and implement and integrate multiyear planningand budgeting; (ii) design and implement a Treasury Single Account for the central administration; and (iii) improve public sector procurement policies and practices."

 
projs$pdo[projs$id == "P127665"] <- "This development policy loan supports the Government of Croatia's reform efforts with the aim to: (i) enhance fiscal sustainability through expenditure-based consolidation; and (ii) strengthen investment climate."

projs$pdo[projs$id == "P075523"] <- "The project’s development objectives are to: (i) enhance the economic, environmental and social sustainability of transport and coastal infrastructure assets; and (ii) manage those assets and natural resources and respond to national emergencies, through an effective partnership with private sector stakeholders."

projs$pdo[projs$id == "P098889"] <- "The project development objective will be changed to benefit cotton farmers and create the conditions for sustainable growth of cotton production in low-income areas of Tajikistan through improved policy environment an better access to finance."

projs$pdo[projs$id == "P151058"] <- "The objective of the proposed project is to support the development of a seafood cluster in Nouadhibou that promotes sustainable management of fisheries and generates value for the communities."

projs$pdo[projs$id == "P173680"] <- "The Program Development Objective is to support economic stability and inclusive growth in Rwanda by (i) promoting long-term fiscal sustainability, (ii) enhancing coverage and equity in access to critical social sector programs, (iii) enabling learning and development of children, and (iv) strengthening decentralized capacity and accountability for accelerated human capital development."

projs$pdo[projs$id == "P159132"] <- "The PDO is to strengthen capacity at NEPAD Agency to support TerrAfrica member countries on knowledge management and actions for scale-up Sustainable Land and Water Management (SLWM). The project objectives have been further specified along the project's three components: Component 1, Support for Regional Coalition Building: Support NEPAD Agency to reinforce the regional coalition that aligns assistance to countries on SLWM and related issues. Component 2, Support for Mutual Learning Among Member Countries:  Support NEPAD Agency's mandate to meet country demand for knowledge and best practice tools to improve land management and climate resilience while maintaining natural capital to reinforce on-going investment operations in countries, as well as national and secto r development planning. Component 3, Support for Member Countries' SLWM Investment Planning: Strengthen NEPAD Agency's role in mee ting country demand for expertise and tools to develop and implement SLWM work programs and national SLWM investment frameworks."


projs$pdo[projs$id == "P122642"] <- "The project development objectives are: (i) improve the administration of the public pension schemes; and (ii) pilot a modest social safety net program as a first step to developing a sustainable approach to safety nets in the country.  The project development objective is to: (i) improve the administration of the public pension schemes; and (ii) develop administrative systems for safety nets interventions, with focus on targeting and benefit payment delivery, while delivering cash benefits to the poorest families in targeted pilot districts."

projs$pdo[projs$id == "P155961"] <- "The Project Development Objective (PDO) is to improve the accessibility to best practices and new knowledge on fisheries management of selected SSA countries. More specifically, this project will: (a) Empower the AU to take the lead on a long lasting fisheries partnership mechanism for Africa, leading to regional policy reforms; (b) Support knowledge exchange, advisory services, data management and monitoring support for the various current and pipeline investments in the region to ensure that lessons from successes and failures of country and LME-level investments and newly generated knowledge on fisheries are adequately disseminated, for a common understanding of the fisheries reforms necessary to meet the WSSD and the 2014 Policy Framework targets; and (c) Support individual coastal countries to access financing to support policy reform and investment through technical support in preparation of fund ing proposals. Attention will be given to reflecting lessons learned from other similar projects, replicating successful interventions, and ensuring continued technical support during project implementation – so this portfolio becomes a series of case studies for the region. The Regional Partnership for African Fisheries Policy Reform is also a follow-up phase medium-sized project to the GEF/World Bank Strategic Partnership project, which was implemented by the AU between 2009 and 2011, and the last project planned un der the Program Framework Document “Strategic Partnership for Sustainable Fisheries Management in the Large Marine Ecosystems in Africa” (P125797). The Project will be financed through a US$2 million Bank-Executed GEF Trust Fund between 2015 and 2018 for a duration of 3 years. It will act as a regional knowledge and monitoring hub for a SSA GEF/World Bank portfolio of current and pipeline fisheries reform projects (“participating projects”) and related partner-supported activities, with the objective of improving the accessibility to best practices and new knowledge on fisheries management of selected SSA countries. Within this context, it is expected th at the Project will assist African coastal countries to make concrete progress towards achieving the fisheries and poverty reduction  targets set by the 2002 WSSD, the follow-up 2012 United Nations Conference on Sustainable Development (Rio+20), and the 2014 Policy  Framework and Reform Strategy for Fisheries and Aquaculture in Africa. The Project is designed around three thematic components as follows (see Annex C for further detail): Component 1: Coalition building (US$0.75 million) Component 2: Knowledge support (US$1 .05 million) Component 3: Portfolio support (US$0.20 million)."

projs$pdo[projs$id == "P112186"] <- "The project objective is to improve the institutional and infrastructure environment for farmers and herders, with a strong emphasis on the livestock sector. More specifically, the project will increase farmers productivity, particularly of livestock farmers in the project areas and reduce animal diseases that have a public health impact (e.g., brucellosis).  The project will provide critical capital investments, strengthen key support services, deliver appropriate know-how, and facilitate and support effective and sustainable management of the country's valuable pasture resources.  This would enable farmers and herders to improve the productivity, profitability and sustainability of their enterprises and thereby help reduce poverty. Through its specific support for a regionally focused demonstration program to control brucellosis, the project would in addition contribute significantly to improved public health. The revised PDO is to improve the Recipient's institutional and infrastructure environment for more productive, profitable and sustainable livestock and crop production by pasture users and smallholder farmers, as well as to reduce the economic impact of the zoonotic disease burden in the human population."


projs$pdo[projs$id == "P099010"] <- "The seventh poverty reduction support credit for Burkina Faso is the first of a third series of PRSCs in support of Burkina Faso's Poverty Reduction Strategy (PRS), adopted in 2000 and updated in 2004. The PRS and its latest Annual Progress Report cover the period 2006 to 2008. This new series of PRSCs (7, 8 and 9) will support the acceleration of real growth and the reduction of poverty incidence in Burkina Faso by: (i) improving the investment climate and promoting exports; (ii) improving access to basic social services through deepened decentralization and strengthened institutional capacity; and (iii) promoting efficiency, transparency and accountability in the use of public resources through enhanced public financial management."


projs$pdo[projs$id == "P069934"] <- "The development objectives of the Pernambuco Integrated Development: Education Quality Improvement Project are to (a) improve the quality, efficiency, and inclusiveness of the public education system; (b) modernize and strengthen the managerial, financial, and administrative capacity of the Secretariat of Education to set policies and guidelines for the sector and deliver public education efficiently; and (c) support the overall state modernization effort through interventions to be carried out in the Secretariat of Education and to be replicated in other state institutions."

projs$pdo[projs$id == "P082278"] <- "The objective of the proposed PAL series would be to enhance economic growth in Croatia through (i) improving the investment climate, and (ii) reducing size and improving efficiency of the public sector. This will require fiscal adjustment, which is a theme running through the proposed operation, given the Croatia's fiscal position. The proposed PAL would also support the Government in its EU accession and integration efforts, and help ensure that key economic reforms under the acquis are effectively implemented and enforced. The PAL series would specifically support: Improvements in the investment climate by re-launching the privatization or liquidation of state-owned enterprises, improving financial discipline of enterprises, reducing the cost of doing business, and strengthening the judiciary. Efforts to improve governance by rationalizing and improving efficiency of public administration, and improving public expenditure management. Measures to enhance fiscal sustainability of sector programs by addressing health financing, rationalizing social benefits, improving fiscal and social sustainability of the pension system, and fiscal sustainability of railways operations." #P082278

projs$pdo[projs$id == "P084302"] <- "The overarching project objective is to improve the efficiency and effectiveness of irrigation water distribution in three AWBs (Ghotlu, Nara and Left Bank), particularly with respect to measures of reliability, equity and user satisfaction. This would be achieved by: (a) deepening and broadening the institutional reforms that are already underway in Sindh; (b) improving the irrigation system in a systematic way covering key hydraulic infrastructure, main and branch canals, and distributaries and minors; and (c) enhancing long-term sustainability of irrigation system through participatory irrigation management and developing institutions for improving operation and maintenance of the system and cost recovery. The improved water management would lead to increased agricultural production, employment and incomes over some about 1.8 million ha or more than 30 percent of the irrigated area in Sindh, and one of the poorest regions of the country." #P084302

projs$pdo[projs$id == "P094233"] <- "The Project Development Objective is to support the adoption of environmentally friendly land use practices on 3,400 hectares in two key Atlantic Forest watersheds in Espirito Santo, thereby contributing to biodiversity conservation. Key indicators include: Increase in area (3,400 has) under sustainable land use practices’ (SLMs). Which will be accomplished in part through the: (i) establishment and implementation of institutional arrangements for payment mechanisms for watershed conservation; and (ii) supporting the adoption of Sustainable Land Management (SLM) practices through the implementation of Payment for Environmental Services mechanisms and the participation and capacity building of local actors. The Global Environment Objective is to reduce threats to globally important biodiversity (for example, the endangered Muriqui monkey) from agricultural production systems and increase habitat for species in two key rainforest watersheds of the Atlantic Forest. Key indicators include: Area of critical habitat restored and/or protected from encroachment; which will be accomplished in part through the: (i) implementation of sustainable market-based mechanisms to finance Protected Areas (PA) management and conservation of biodiversity in agricultural landscapes." #P094233

projs$pdo[projs$id == "P106641"] <- "The DPL program supports staged progress towards medium-term program outcomes related to two CPS objectives: promoting broadbased economic growth by means of measures designed to enhance Panama's competitiveness, and establishing modern public financial management systems and institutions." #P106641

projs$pdo[projs$id == "P116125"] <- "The program has the following objectives: (i) to support structural reforms that will ensure fiscal consolidation over the medium term; (ii) to protect priority investment programs and the upfront fiscal costs of structural reforms critical to meeting Poland's goals of convergence with the rest of the EU; and (iii) to mitigate the social cost of the economic crisis." #P116125

projs$pdo[projs$id == "P150038"] <- "The overall Project goal is to provide support to the CIS national statistical offices to strengthen their capacity in collection of regular labor statistical data on the basis of uniformed methodology; to enhance knowledge exchange at national and international levels and to upgrade CIS and ILO statistical databases on labor. Specific Project's objectives are: modernisation of current labor statistics systems in the CIS countries through enhancing capacity of their national statistical services and other agencies producers of labor data by increasing the use of modern data collection tools, international methodology, definitions, standards and classifications; upgrading of currently conducted surveys such as household-based labor force surveys and/or enterprise-based surveys, population census; development of methodology for implementation of labor force surveys on a regular basis, including data collection on labor migration and employment in informal sector of economy."  #P150038

projs$pdo[projs$id == "P131325"] <- "The overarching project objective is to improve the efficiency and effectiveness of irrigation water distribution in three AWBs (Ghotlu, Nara and Left Bank), particularly with respect to measures of reliability, equity and user satisfaction. This would be achieved by: (a) deepening and broadening the institutional reforms that are already underway in Sindh; (b) improving the irrigation system in a systematic way covering key hydraulic infrastructure, main and branch canals, and distributaries and minors; and (c) enhancing long-term sustainability of irrigation system through participatory irrigation management and developing institutions for improving operation and maintenance of the system and cost recovery. The improved water management would lead to increased agricultural production, employment and incomes over some about 1.8 million ha or more than 30 percent of the irrigated area in Sindh, and one of the poorest regions of the country." #P131325

projs$pdo[projs$id == "P162274"] <- "The objective of the Project is to support the efforts of the Recipient to: (i) demobilize members of armed groups of Rwandan origin and members of the Rwandan Defense Force; and (ii) provide socioeconomic reintegration support to said members following demobilization, with a particular focus on the provision of such support to female, child, and disabled ex-combatants." #P162274

projs$pdo[projs$id == "P162166"] <- "The project's development objective is to empower citizens through social accountability interventions, to improve the utilization of post-Ebola recovery funds in the health and the education sectors in targeted districts of the member country." #P162166
Get rid of HTML tags like

and

and return just the inner text (e.g., “some text.”)

projs <- projs %>%
  mutate(pdo = stringr::str_replace_all(pdo, "<[^>]+>", ""))

_______

SAMPLE SPLITTING

Splitting the sample into training, validation, and test sets. In case of validating ML models but also to work on something smaller.

# ensure we always get the same result when sampling (for convenience )
set.seed(12345)

# use `regionname` as strata 
tabyl(projs$regionname)

projs_split <- projs %>%
  # define the training proportion as 75%
   rsample::initial_validation_split(prop = c(0.50, 0.25),
                                     # ensuring both sets are balanced in gender
                                     strata = regionname)

# resulting 3 datasets
projs_train <- rsample::training(projs_split)
projs_val <- rsample::validation(projs_split)
projs_test <- rsample::testing(projs_split)

nrow(projs_train)
nrow(projs_val)
nrow(projs_test)
tabyl(projs_train$regionname)
tabyl(projs_val$regionname)
tabyl(projs_test$regionname)

_______

[from here I work on projs_train only]

_______

PoS tagging + tokenization with cleanNLP

PoS tagging with cleanNLP

Here’s the general process for tagging (or “annotating”) text with the cleanNLP package:

  1. Make a dataset where one column is the id (line number, chapter number, book+chapter, etc.), and another column is the text itself.
  2. Initialize the NLP tagger. You can use any of these:
    • cnlp_init_udpipe(): Use an R-only tagger that should work without installing anything extra (a little slower than the others, but requires no extra steps!)
    • cnlp_init_spacy(): Use spaCy (if you’ve installed it on your computer with Python)
    • cnlp_init_corenlp(): Use Stanford’s NLP library (if you’ve installed it on your computer with Java)
  3. Feed the data frame from step 1 into the cnlp_annotate() function and wait.
  4. Save the tagged data on your computer so you don’t have to re-tag it every time.

Prep HYPHENATED words

# Replace hyphens with a placeholder before annotation
pdo_train_to_tag <- projs_train %>%
   select(id, pdo) %>%
   mutate(pdo_2 = case_when(
      str_detect(pdo, "scale-up|scaled-up") ~ str_replace_all(pdo, "scale-up|scaled-up", "SCALEUPPLACEHOLDER"), # 17 
      str_detect(pdo, "follow-up") ~ str_replace_all(pdo, "follow-up", "FOLLOWUPPLACEHOLDER"), # 6
      str_detect(pdo, "know-how") ~ str_replace_all(pdo, "know-how", "KNOWHOWPLACEHOLDER"), # 5 
      str_detect(pdo, "Covid-19|COVID-19|covid-19") ~ str_replace_all(pdo, "Covid-19|COVID-19|covid-19", "COVIDPLACEHOLDER"), # ???
      # last !!!!!!!!
      str_detect(pdo, "(?<=\\p{L})-(?=\\p{L})") ~ str_replace_all(pdo, "-", "HYPHENWORD"), # 1396 
      TRUE ~ pdo
   )) 

[LOOONG] Annotate with cleanNLP::cnlp_annotate()

[Only the 1st time]

# # ---- Initialize the tagger
# cleanNLP::cnlp_init_udpipe()
# 
# # ---- Use the built-in R-based tagger
# pdo_train_tagged <- cleanNLP::cnlp_annotate(pdo_train_to_tag, 
#                                   text_name = "pdo_2", 
#                                   doc_name = "id")
# 
# # save the input data as .rds
# saveRDS(pdo_train_tagged, here("data","derived_data","pdo_train_tagged.rds"))

[Conditionally re-annotate]

# save the input data as .rds
saveRDS(pdo_train_to_tag, here("data","derived_data","pdo_train_to_tag.rds"))
# define the path to save the tagged data
input_path <- here("data","derived_data","pdo_train_to_tag.rds")
output_path <- here("data","derived_data","pdo_train_tagged.rds")
# ---- Initialize the tagger
cleanNLP::cnlp_init_udpipe()

# ---- Conditionally annotate the data only if it has changed since the last time
# Check if previous input data exists; if not, save it for the first time
if (!file.exists(input_path)) {
  # Save pdo_train_to_tag to initialize the tracking
  saveRDS(pdo_train_to_tag, input_path)
  message("Initial input saved.")
}

# Load the previous input if it exists
previous_input <- readRDS(input_path)

# Only re-run if the input data has changed
if (!identical(pdo_train_to_tag, previous_input)) {
  # Run the annotation
  pdo_train_tagged <- cleanNLP::cnlp_annotate(pdo_train_to_tag, 
                                              text_name = "pdo_2", 
                                              doc_name = "id")
  
  # Save the updated input and annotated result
  saveRDS(pdo_train_to_tag, input_path)
  saveRDS(pdo_train_tagged, output_path)
  
  message("Annotation updated.")
} else {
  # Load the previously saved output if data is unchanged
  if (file.exists(output_path)) {
    pdo_train_tagged <- readRDS(output_path)
    message("Loaded previous annotation.")
  } else {
    message("No previous annotation found. Please run the annotation first.")
  }
}

Manipulate the tagged data

# ---- Extract $token & Convert annotations to a tibble (data frame format)
pdo_train_tag_t <-  pdo_train_tagged$token %>% 
   # ... convert annotations to a tibble (data frame format)
   as_tibble()    # 225,360

# ---- Replace the placeholder back with a hyphen
pdo_train_tag_t <- pdo_train_tag_t %>% 
   mutate(word = token) %>%
   
   # Replace placeholders in the 'token' column
   mutate(token = case_when(
      str_detect(word, "HYPHENWORD") ~ str_replace_all(word, "HYPHENWORD", "-"),
      str_detect(word, "SCALEUPPLACEHOLDER") ~ str_replace_all(word, "SCALEUPPLACEHOLDER", "scale-up"),
      str_detect(word, "FOLLOWUPPLACEHOLDER") ~ str_replace_all(word, "FOLLOWUPPLACEHOLDER", "follow-up"),
      str_detect(word, "KNOWHOWPLACEHOLDER") ~ str_replace_all(word, "KNOWHOWPLACEHOLDER", "know-how"),
      str_detect(word, "COVIDPLACEHOLDER") ~ str_replace_all(word, "COVIDPLACEHOLDER", "covid-19"),
      TRUE ~ word
   )) %>%
   
   # Replace placeholders in the 'token_ws' column
   mutate(token_with_ws = case_when(
      str_detect(token_with_ws, "HYPHENWORD") ~ str_replace_all(token_with_ws, "HYPHENWORD", "-"),
      str_detect(token_with_ws, "SCALEUPPLACEHOLDER") ~ str_replace_all(token_with_ws, "SCALEUPPLACEHOLDER", "scale-up"),
      str_detect(token_with_ws, "FOLLOWUPPLACEHOLDER") ~ str_replace_all(token_with_ws, "FOLLOWUPPLACEHOLDER", "follow-up"),
      str_detect(token_with_ws, "KNOWHOWPLACEHOLDER") ~ str_replace_all(token_with_ws, "KNOWHOWPLACEHOLDER", "know-how"),
      str_detect(token_with_ws, "COVIDPLACEHOLDER") ~ str_replace_all(token_with_ws, "COVIDPLACEHOLDER", "covid-19"),   
      TRUE ~ token_with_ws
   )) %>%
   
   # Replace placeholders in the 'lemma' column
   mutate(lemma = case_when(
      str_detect(lemma, "HYPHENWORD") ~ str_replace_all(lemma, "HYPHENWORD", "-"),
      str_detect(lemma, "Scaleupplaceholder") ~ str_replace_all(lemma, "Scaleupplaceholder", "scale-up"),
      str_detect(lemma, "Followupplaceholder") ~ str_replace_all(lemma, "Followupplaceholder", "follow-up"),
      str_detect(lemma, "Knowhowplaceholder") ~ str_replace_all(lemma, "Knowhowplaceholder", "know-how"),
      str_detect(lemma, "Covidplaceholder") ~ str_replace_all(lemma, "Covidplaceholder", "covid-19"),
      TRUE ~ lemma
   )) %>%
 
   relocate(word, .after = doc_id) # 225,360

Check for duplicates

# check  
projs_train$id %>% n_distinct() # 4403
pdo_train_tagged$token$doc_id %>% n_distinct() # 4403  OK!
pdo_train_tag_t$doc_id %>% n_distinct() # 4403  OK! 

Re-add cols from projs_train to pdo_train_t

# ---- RE-add cols from `projs_train_token ` `
pdo_train_t <-  pdo_train_tag_t %>% 
   left_join(projs_train, by = c("doc_id" = "id"))  
# ---- Select cols 
pdo_train_t <- pdo_train_t %>% 
   select(
      proj_id=  doc_id                ,
      pdo,
      sid                   , # sentence ID
      tid                   , # token ID within sentence
      word         ,
      token                 , # Tokenized form of the token.
      token_with_ws         , # Token with trailing whitespace.
      lemma                 , # Lemmatized form of the token.
      upos                  , # Universal part-of-speech tag.
      xpos                  , # Language-specific part-of-speech tag
      feats                 , # Morphological features of the token.
      tid_source            , # Token ID in the source document.
      relation              , # Dependency relation to the head of the token.
      pr_name               , # Name of the parent token.
      pdo                   ,
      FY_appr = boardapprovalFY       ,
      FY_clos = closingdateFY         ,
      status                ,
      regionname            ,
      countryname           ,
      sector1               ,
      theme1                ,
      lendinginstr          ,
      env_cat               ,
      ESrisk                ,
      curr_total_commitment 
   )

Ensure tid is numeric

# Ensure token_id is numeric
pdo_train_t <- pdo_train_t %>% 
  mutate(tid = as.numeric(tid))  # Convert token_id to numeric

Add low case token

pdo_train_t <- pdo_train_t %>% 
  mutate(token_l = tolower(token)) %>% 
   relocate(token_l, .after = token) %>% 
   select(-token_with_ws) 

Restore variations of “hyphenword” with “-”

#Replace variations of "hyphenword" with "-"
pdo_train_t <- pdo_train_t %>% 
  mutate(
    lemma = str_replace_all(lemma, regex("hyphenword|hyphenwor", 
                                         ignore_case = TRUE), "-")
  )

_______

Save clean Proj PDO train dataset pdo_train_t

# inputs 
saveRDS(projs_train, here("data" , "derived_data", "projs_train.rds"))   #   4403  obs 
saveRDS(projs_test, here("data" , "derived_data", "proj_test.rds"))      #   2204 obs 
saveRDS(projs_val, here("data" , "derived_data", "projs_val.rds"))       #   2204 obs 
# projs_train + annotated pdo train 
saveRDS(pdo_train_t, here("data" , "derived_data", "pdo_train_t.rds"))   #  249 k   obs