first_report.Rmd

---
title: "PTM discovery dates"
output: html_notebook
---
The goal is to find the first report of each PTM. PIR (the RESID database) lists this information, so we scrape it, then cross-reference it and fill in the gaps with PubMed queries.

```{r setup}
library(tidyverse)
library(rentrez) #https://cran.r-project.org/web/packages/rentrez/vignettes/rentrez_tutorial.html
library(lubridate)
library(here)
library(rvest)
library(janitor)
library(feather)
library(beepr)
library(data.table)
library(vroom)
library(cowplot)

#rm(list=ls()) #clear environment

# Record when the notebook started; the final chunk uses this to report the total run time.
start_time <- Sys.time()
```

# PIR scrape
Get resids from PIR master page
http://pir0.georgetown.edu/cgi-bin/resid
```{r}
# Fetch the RESID index page and parse its first <table> into a data frame.
page <- read_html("http://pir0.georgetown.edu/cgi-bin/resid")
pir_raw <- html_table(html_node(page, "table"), fill = TRUE)

# Keep rows whose first column holds a RESID id (AA + 4 digits), drop the
# first two matches, and keep/rename the first seven columns in one step.
pir <- pir_raw %>%
  filter(str_detect(X1, "AA\\d{4}")) %>%
  slice(-(1:2)) %>%
  select(
    resid = X1,
    name = X2,
    sequence_spec = X3,
    weight = X4,
    keyword = X5,
    feature = X6,
    enzyme = X7
  )

# Cache the index so later chunks can run without re-scraping.
save(pir, file = here::here("data", "pir.RData"))

```

# Page scrape
Get data from a single page 
http://pir0.georgetown.edu/cgi-bin/resid?id=AA0601
Then loop through all IDs
```{r eval=FALSE}
load(here::here("data", "pir.RData"))

#id <- "AA0601"
resid <- pir$resid
#resid_short <- resid[1:3]

# Scrape the ".annot" text of every RESID entry page.
# Each ID yields a one-row tibble: `id` plus a nested `data` tibble holding one
# `value` row per annotation node. Results are collected in a list and bound
# once at the end, instead of re-copying a growing tibble on every iteration.
entry_tables <- purrr::map(resid, function(id) {
  annot_text <- read_html(
    paste0("http://pir0.georgetown.edu/cgi-bin/resid?id=", id)
  ) %>%
    html_nodes(".annot") %>%
    html_text(trim = TRUE)

  tibble(id = id, value = annot_text) %>%
    nest(data = c(value))
})

# The typed empty template guarantees the id/data columns exist even when
# `resid` is empty.
pir_master <- bind_rows(
  tibble(id = character(), data = list()),
  entry_tables
)

save(pir_master, file = here::here("data", "pir_master.RData"))
beep(sound = 8) #because mario is awesome
```
# start here
```{r}
# Reload the cached scrape results so this chunk can be the entry point.
load(file = here::here("data", "pir_master.RData"))
load(file = here::here("data", "pir.RData"))

# One row per annotation text block, keyed by RESID id.
pir_unnested <- unnest(pir_master, cols = c(data))

# Earlier experiment, kept for reference:
#pir_clean <- pir_unnested %>%
#  separate(value, into = c("temp", "authors"), sep = "Authors\\:")
#pir_clean$authors <- str_trim(pir_clean$authors, side = "left")
#%>%
#separate(authors, into = c("authors", "title"), sep = "Title\\:")

# Keep only the annotation blocks that mention "Reference", pull the PMID and
# the 4-digit year that precedes "Title:", then keep the first row per id after
# sorting by id and year (year is a character column here).
pir_clean <- pir_unnested %>%
  filter(str_detect(value, "Reference")) %>%
  mutate(
    pmid = str_extract(value, "(?<=PMID\\:)\\d{1,8}"),
    year = str_extract(value, "\\d{4}(?=\\sTitle\\:)")
  ) %>%
  arrange(id, year) %>%
  distinct(id, .keep_all = TRUE)


```

# Join
```{r}
# Attach the earliest reference (value, pmid, year) to each RESID index row,
# then blank out the "&nbsp" placeholder cells.
# The cleaning is done with mutate(across(...)) rather than purrr::map() so that
# `pir` stays a tibble (map() would return a plain list of columns), and it is
# limited to character columns because the placeholder is a string.
pir <- pir %>%
  left_join(pir_clean, by = c("resid" = "id")) %>%
  mutate(across(where(is.character), ~ na_if(.x, "&nbsp")))
```

# Get systematic names
```{r}
# From the annotation blocks that mention "Systematic", capture the text
# between "name:" and the whitespace before "Cross" as the systematic name.
# (.* matches any number of characters.)
pir_sys <- pir_unnested %>%
  filter(str_detect(value, "Systematic")) %>%
  mutate(sys_name = str_extract(value, "(?<=name\\:).*(?=\\sCross)"))

# Regex debugging helpers:
#str_extract(pir_sys$value, "(?<=name\\:).")
#str_extract(pir_sys$value, ".(?=\\sCross)")
```

# PUBMED
The next goal is to query PubMed to fill in the gaps and cross-reference the PIR dates.

# Explore
```{r}
# Show the fields that can be used in PubMed search terms (exploratory; result is only printed).
entrez_db_searchable("pubmed")
```

# Container
```{r eval=FALSE}
# Empty, typed tibble describing the PubMed first-report results:
#   i    - search term
#   id   - PubMed ID (numeric)
#   year - publication year (character)
# Note: this chunk is marked eval=FALSE, so it only runs when executed by hand.
first <- tibble(i = character(), 
                id = numeric(),
                year = character())
```

# First search
```{r}
load(here::here("data", "ptm_vec.RData"))
#terms <- c("lysine butyrylation", "lysine acetylation", "lysine succinylation")
#terms <- sample(ptm_vec, 10)
terms <- ptm_vec #from ptm.Rmd

# Find the earliest PubMed record for one search term.
#
# term: a single PubMed query string.
# Returns a one-row tibble with columns
#   i    - the search term,
#   id   - the smallest PMID among the hits (0 when there are no hits),
#   year - first 4-digit year found in that record's pubdate (NA when no hits).
find_first_report <- function(term) {
  message("Getting entry for ", term)
  Sys.sleep(5) #add sleepy time according to https://www.ncbi.nlm.nih.gov/robots.txt

  # First ask only for the hit count, then request that many IDs.
  # (350K phosphorylation entries!!!)
  # NOTE(review): NCBI documents an upper limit on how many PubMed IDs a single
  # ESearch call returns; for very common terms the returned set may not
  # contain the smallest PMID -- confirm against the E-utilities docs.
  hit_count <- entrez_search(db = "pubmed", term = term, retmax = 0)$count

  pmids <- numeric(0)
  if (hit_count > 0) {
    pmids <- as.double(
      entrez_search(db = "pubmed", term = term, retmax = hit_count)$ids
    )
  }

  if (length(pmids) == 0) {
    return(tibble(i = term, id = 0, year = NA_character_))
  }

  first_pmid <- min(pmids) #pubmed ids start small, and count up
  pub_year <- entrez_summary(db = "pubmed", id = first_pmid) %>%
    purrr::pluck("pubdate") %>%
    str_extract("\\d{4}") #extract first 4 digits from the date string to get year

  tibble(i = term, id = first_pmid, year = pub_year)
}

# Build the result from scratch on every run. The typed empty template fixes
# the column types and keeps the columns present when `terms` is empty; it also
# means this chunk no longer depends on a pre-existing `first` object (which
# would otherwise resolve to dplyr::first on a clean run, or accumulate
# duplicate rows on a re-run).
first <- bind_rows(
  tibble(i = character(), id = numeric(), year = character()),
  purrr::map(terms, find_first_report)
)

beep(sound = 8) #because mario is awesome
save(first, file = here::here("data", "first.RData"))

#this will give me a pubmed ID for each of the first entries of a search term

#ac_search
#ac_search$ids

#write_csv(first, here::here("data", "first.csv"))

```

# Manually set some dates, NAs, etc.
```{r}
# Read first.csv and rename the columns to match the PTM table used in the
# merge below: search term -> ID, PubMed id -> pmid.
first <- here::here("data", "first.csv") %>%
  vroom() %>%
  rename(ID = i, pmid = id)
```

# Plot
```{r}
# Cumulative number of search terms by year of their earliest PubMed hit.
first %>%
  dplyr::mutate(year = as.numeric(year)) %>%
  dplyr::count(year) %>%
  ggplot(aes(x = year, y = cumsum(n), group = 1)) +
  geom_step()
```

# Merge dates

The PTM list from https://www.uniprot.org/docs/ptmlist includes RESID identifiers, so run ptm.Rmd first, then run the remaining code below to merge everything.
```{r}
# Load the PTM table produced by ptm.Rmd, keep the fields needed here, and
# attach the PIR reference data by matching the DR column to the RESID id.
load(file = here::here("data", "ptm.RData"))
first_ptm <- left_join(
  select(ptm, AC, ID, FT, KW, DR),
  pir_clean,
  by = c("DR" = "id")
)
```

# Merge with first
```{r}
# Normalise the join key and make the PIR year/pmid numeric so they can be
# compared with the PubMed-derived values.
first_ptm$ID <- str_trim(first_ptm$ID, side = "left")
first_ptm$year <- as.numeric(first_ptm$year)
first_ptm$pmid <- as.numeric(first_ptm$pmid)

# After this join, the .x columns come from PIR and the .y columns from `first`.
first_ptm <- first_ptm %>%
  left_join(first, by = "ID")

# Pick the earlier of the two years, ignoring missing values, and the PMID that
# goes with it. The PIR PMID wins only when its year is known and strictly
# earlier than the PubMed year (or the PubMed year is missing); otherwise the
# PubMed PMID is used.
# This is NA-aware on purpose: it avoids filling missing years with a sentinel
# value, which would overwrite year.x/year.y in the exported table and would
# turn a genuine report from the sentinel year into NA.
# year_final and pmid_final are added in this order, with no helper columns.
first_ptm <- first_ptm %>%
  mutate(
    year_final = pmin(year.x, year.y, na.rm = TRUE),
    pmid_final = if_else(
      !is.na(year.x) & (is.na(year.y) | year.x < year.y),
      pmid.x,
      pmid.y
    )
  )

#clean
first_ptm <- first_ptm %>%
  filter(str_detect(ID, "Cyclo", negate = TRUE),
         str_detect(ID, "Blocked", negate = TRUE))

#manual curation
# Overwrite the final year and PubMed id for one PTM in the global `first_ptm`.
#
# ac:     accession to look up; matched as a pattern against first_ptm$AC.
# year:   curated year, stored in year_final.
# pubmed: curated PubMed id, stored in pmid_final.
#
# `<<-` is used because the function edits the global data frame rather than a
# local copy. Columns are addressed by name (not by position) so the helper
# keeps working if columns are added or reordered, and it stops with a clear
# message unless exactly one row matches.
curate <- function(ac, year, pubmed) {
  ac_index <- str_which(first_ptm$AC, as.character(ac))
  if (length(ac_index) != 1) {
    stop(
      "curate(): expected exactly one row matching AC '", ac,
      "', found ", length(ac_index),
      call. = FALSE
    )
  }
  first_ptm$year_final[ac_index] <<- as.numeric(year)
  first_ptm$pmid_final[ac_index] <<- as.numeric(pubmed)
}
# Hand-curated first reports: accession, year, PubMed id.
curate("PTM-0499", 2016, 27105115)
curate("PTM-0675", 1982, 7115308)
curate("PTM-0487", 2014, 24703693)
curate("PTM-0193", 1975, 1184585)
#curate("PTM-0639", 1925, 25101001)

#get rid of dupes (some in early refs)

# The output file is passed positionally: readr renamed this argument from
# `path` to `file`, and `path =` is deprecated.
write_csv(first_ptm, here::here("output", "first_ptm.csv"))
```

# Final plot
```{r}
# Cumulative count of protein modifications by year of first report.
first_ptm %>%
  mutate(year_final = as.numeric(year_final)) %>%
  dplyr::count(year_final) %>%
  mutate(running_total = cumsum(n)) %>%
  ggplot(aes(x = year_final, y = running_total, group = 1)) +
  geom_step() +
  labs(x = "Year of Publication", y = "Cumulative Protein Modifications") +
  scale_x_continuous(
    limits = c(1920, 2020),
    breaks = seq(1920, 2020, by = 20)
  ) +
  theme_half_open()

# Save the plot that was just drawn.
ggsave(
  filename = here::here("output", "fig2.pdf"),
  plot = last_plot(),
  width = 8,
  height = 6,
  units = "in",
  dpi = 300
)
```


```{r}
beep(sound = 8) #because mario is awesome

# Report the total run time in minutes (one decimal place), measured from
# start_time set in the setup chunk.
end_time <- Sys.time()
time_taken <- round(
  as.duration(interval(start_time, end_time)) / dminutes(1),
  digits = 1
)
print(time_taken)

```