Skip to content

Commit

Permalink
Working disk caching
Browse files Browse the repository at this point in the history
  • Loading branch information
compscidr committed Oct 26, 2024
1 parent a69ccad commit 39ca7fb
Show file tree
Hide file tree
Showing 5 changed files with 151 additions and 35 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ This tool is inspired by [scholar.py](https://github.com/ckreibich/scholar.py)
```
import "github.com/compscidr/scholar"
sch := scholar.New()
sch := scholar.New("profile.json", "articles.json")
articles := sch.QueryProfile("SbUmSEAAAAAJ", 1)
for _, article := range articles {
Expand All @@ -23,11 +23,11 @@ Working:
* Caches the profile for a day, and articles for a week (need to confirm this is working)
* The cache lives in memory; call `SaveCache` before exiting to persist it to disk, otherwise it is lost when the program restarts
* Configurable limit to number of articles to query in one go
* On-disk caching of the profile and articles to avoid hitting the rate limit

## TODO:
* Pagination of articles
* Add throttling to avoid hitting the rate limit (figure out what the limit is)
* Add on-disk caching so that if program restarts the cache is not lost

## Possible throttle info:
https://stackoverflow.com/questions/60271587/how-long-is-the-error-429-toomanyrequests-cooldown
3 changes: 3 additions & 0 deletions scholar-example/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
articles.json
profile.json

18 changes: 15 additions & 3 deletions scholar-example/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ func main() {
user := *userPtr
limit := *limitPtr

sch := scholar.New()
sch := scholar.New("profile.json", "articles.json")
//articles := sch.QueryProfileDumpResponse(user, limit, true)
//articles := sch.QueryProfile(user, limit)
articles := sch.QueryProfileWithCache(user, limit)
articles := sch.QueryProfileWithMemoryCache(user, limit)

if len(articles) == 0 {
fmt.Println("Not found")
Expand All @@ -37,7 +37,7 @@ func main() {
fmt.Println(article)
}

cachedArticles := sch.QueryProfileWithCache(user, limit)
cachedArticles := sch.QueryProfileWithMemoryCache(user, limit)
if len(articles) == 0 {
fmt.Println("Not found")
return
Expand All @@ -46,4 +46,16 @@ func main() {
for _, article := range cachedArticles {
fmt.Println(article)
}

sch.SaveCache("profile.json", "articles.json")
sch2 := scholar.New("profile.json", "articles.json")
cachedArticles2 := sch2.QueryProfileWithMemoryCache(user, limit)
if len(articles) == 0 {
fmt.Println("Not found")
return
}

for _, article := range cachedArticles2 {
fmt.Println(article)
}
}
159 changes: 130 additions & 29 deletions scholar.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,16 @@ package go_scholar

import (
"bytes"
"encoding/json"
"fmt"
"github.com/PuerkitoBio/goquery"
cmap "github.com/orcaman/concurrent-map/v2"
"io"
"log"
"net/http"
"os"
"strconv"
"strings"
"sync"
"time"
)

Expand Down Expand Up @@ -46,49 +48,145 @@ type Profile struct {
}

type Scholar struct {
articles cmap.ConcurrentMap[string, Article] // map of articles by URL
profile cmap.ConcurrentMap[string, Profile] // map of profile by User string
articles sync.Map // map of articles by URL
profile sync.Map // map of profile by User string
}

func New() Scholar {
return Scholar{
articles: cmap.New[Article](),
profile: cmap.New[Profile](),
// New builds a Scholar whose in-memory caches are seeded from two JSON files:
// profileCache (a JSON object mapping user to Profile) and articleCache (a
// JSON object mapping article URL to Article).
//
// The two files are loaded independently. A file that cannot be opened or
// decoded is logged and leaves only its own cache empty, so a missing or
// corrupt article cache does not throw away a valid profile cache (and vice
// versa). New never returns nil.
func New(profileCache string, articleCache string) *Scholar {
	sch := &Scholar{}

	var profiles map[string]Profile
	if err := loadCacheFile(profileCache, &profiles); err != nil {
		log.Printf("loading profile cache file %q, starting with an empty profile cache: %v", profileCache, err)
	} else {
		// Copy the decoded plain map into the concurrent map.
		for user, profile := range profiles {
			sch.profile.Store(user, profile)
		}
	}

	var articles map[string]Article
	if err := loadCacheFile(articleCache, &articles); err != nil {
		log.Printf("loading article cache file %q, starting with an empty article cache: %v", articleCache, err)
	} else {
		for url, article := range articles {
			sch.articles.Store(url, article)
		}
	}

	return sch
}

// loadCacheFile decodes the JSON document stored at path into dst, which must
// be a non-nil pointer. It returns the open or decode error, if any; on error
// the contents of dst must not be used.
func loadCacheFile(path string, dst interface{}) error {
	file, err := os.Open(path)
	if err != nil {
		return err
	}
	// The file is only read, so a Close failure cannot lose data.
	defer file.Close()

	return json.NewDecoder(file).Decode(dst)
}

// SaveCache writes the in-memory profile cache to profileCache and the
// in-memory article cache to articleCache as JSON objects, creating the files
// or truncating them if they already exist.
//
// The two caches are written independently: a failure on one file is logged
// and does not prevent the other from being saved. Errors are logged rather
// than returned.
func (sch *Scholar) SaveCache(profileCache string, articleCache string) {
	if err := writeCacheFile(profileCache, &sch.profile); err != nil {
		log.Printf("saving profile cache file %q: %v", profileCache, err)
	}
	if err := writeCacheFile(articleCache, &sch.articles); err != nil {
		log.Printf("saving article cache file %q: %v", articleCache, err)
	}
}

// writeCacheFile copies entries into a plain map and JSON-encodes that map to
// the file at path. It returns the first create, encode or close error.
func writeCacheFile(path string, entries *sync.Map) (err error) {
	file, err := os.Create(path)
	if err != nil {
		return err
	}
	defer func() {
		// A failed Close on a written file can mean the data never reached
		// the disk, so surface it unless an earlier error already explains
		// the failure.
		if closeErr := file.Close(); closeErr != nil && err == nil {
			err = closeErr
		}
	}()

	// encoding/json cannot encode a sync.Map directly, so copy it first.
	snapshot := make(map[string]interface{})
	entries.Range(func(key, value interface{}) bool {
		// Only string keys can become JSON object keys here; skip anything
		// else instead of panicking on the type assertion.
		if name, ok := key.(string); ok {
			snapshot[name] = value
		}
		return true
	})

	return json.NewEncoder(file).Encode(snapshot)
}

// String renders the article as a multi-line, human-readable block: an
// "Article(" header, one "label=value" line per field (each preceded by a
// newline and a single space), and a closing ")". Slice-valued URL fields are
// joined with ", ".
func (a Article) String() string {
	lines := []string{
		"Title=" + a.Title,
		"authors=" + a.Authors,
		"ScholarURL=" + a.ScholarURL,
		"Year=" + strconv.Itoa(a.Year),
		"Month=" + strconv.Itoa(a.Month),
		"Day=" + strconv.Itoa(a.Day),
		"NumCitations=" + strconv.Itoa(a.NumCitations),
		"Articles=" + strconv.Itoa(a.Articles),
		"Description=" + a.Description,
		"PdfURL=" + a.PdfURL,
		"Journal=" + a.Journal,
		"Volume=" + a.Volume,
		"Pages=" + a.Pages,
		"Publisher=" + a.Publisher,
		"scholarCitedByURL=" + strings.Join(a.ScholarCitedByURLs, ", "),
		"scholarVersionsURL=" + strings.Join(a.ScholarVersionsURLs, ", "),
		"scholarRelatedURL=" + strings.Join(a.ScholarRelatedURLs, ", "),
		"LastRetrieved=" + a.LastRetrieved.String(),
	}
	return "Article(\n " + strings.Join(lines, "\n ") + "\n)"
}

func (sch Scholar) QueryProfile(user string, limit int) []Article {
// QueryProfile returns the articles for the given user by delegating to
// QueryProfileDumpResponse with per-article querying enabled and response
// dumping disabled. user and limit are passed through unchanged.
func (sch *Scholar) QueryProfile(user string, limit int) []Article {
	const (
		queryArticles = true
		dumpResponse  = false
	)
	return sch.QueryProfileDumpResponse(user, queryArticles, limit, dumpResponse)
}

func (sch Scholar) QueryProfileWithCache(user string, limit int) []Article {
if sch.profile.Has(user) {
p, _ := sch.profile.Get(user)
lastAccess := p.LastRetrieved
func (sch *Scholar) QueryProfileWithMemoryCache(user string, limit int) []Article {

profileResult, profileOk := sch.profile.Load(user)
if profileOk {
profile := profileResult.(Profile)
lastAccess := profile.LastRetrieved
if (time.Now().Sub(lastAccess)).Seconds() > MAX_TIME_PROFILE.Seconds() {
println("Profile cache expired for User: " + user)
sch.profile.Remove(user)
sch.profile.Delete(user)
articles := sch.QueryProfileDumpResponse(user, true, limit, false)
var articleList []string
for _, article := range articles {
articleList = append(articleList, article.ScholarURL)
}
sch.profile.Set(user, Profile{User: user, LastRetrieved: time.Now(), Articles: articleList})
newProfile := Profile{User: user, LastRetrieved: time.Now(), Articles: articleList}
sch.profile.Store(user, newProfile)
} else {
println("Profile cache hit for User: " + user)
// cache hit, return the Articles
articles := make([]Article, 0)
for _, articleURL := range p.Articles {
if sch.articles.Has(articleURL) {
cacheArticle, _ := sch.articles.Get(articleURL)
for _, articleURL := range profile.Articles {
articleResult, articleOk := sch.articles.Load(articleURL)
if articleOk {
cacheArticle := articleResult.(Article)
if (time.Now().Sub(cacheArticle.LastRetrieved)).Seconds() > MAX_TIME_ARTICLE.Seconds() {
println("Cache expired for article: " + articleURL + "\nLast Retrieved: " + cacheArticle.LastRetrieved.String() + "\nDifference: " + time.Now().Sub(cacheArticle.LastRetrieved).String())
article := sch.QueryArticle(articleURL, Article{}, false)
sch.articles.Set(articleURL, article)
sch.articles.Store(articleURL, article)
articles = append(articles, article)
} else {
println("Cache hit for article: " + articleURL)
Expand All @@ -99,7 +197,7 @@ func (sch Scholar) QueryProfileWithCache(user string, limit int) []Article {
println("Cache miss for article: " + articleURL)
article := sch.QueryArticle(articleURL, Article{}, false)
articles = append(articles, article)
sch.articles.Set(articleURL, article)
sch.articles.Store(articleURL, article)
}
}
return articles
Expand All @@ -112,7 +210,8 @@ func (sch Scholar) QueryProfileWithCache(user string, limit int) []Article {
for _, article := range articles {
articleList = append(articleList, article.ScholarURL)
}
sch.profile.Set(user, Profile{User: user, LastRetrieved: time.Now(), Articles: articleList})
newProfile := Profile{User: user, LastRetrieved: time.Now(), Articles: articleList}
sch.profile.Store(user, newProfile)
return articles
}

Expand All @@ -127,7 +226,7 @@ func (sch Scholar) QueryProfileWithCache(user string, limit int) []Article {
// want to get updated information from the profile page only to save requests
//
// if dumpResponse is true, it will print the response to stdout (useful for debugging)
func (sch Scholar) QueryProfileDumpResponse(user string, queryArticles bool, limit int, dumpResponse bool) []Article {
func (sch *Scholar) QueryProfileDumpResponse(user string, queryArticles bool, limit int, dumpResponse bool) []Article {
var articles []Article
client := &http.Client{}

Expand Down Expand Up @@ -171,26 +270,27 @@ func (sch Scholar) QueryProfileDumpResponse(user string, queryArticles bool, lim
article.NumCitations, _ = strconv.Atoi(s.Find(".gsc_a_c").Children().First().Text())

if queryArticles {
if sch.articles.Has(BaseURL + tempURL) {
articleResult, articleOk := sch.articles.Load(BaseURL + tempURL)
if articleOk {
// hit the cache
cacheArticle, _ := sch.articles.Get(BaseURL + tempURL)
cacheArticle := articleResult.(Article)
if (time.Now().Sub(article.LastRetrieved)).Seconds() > MAX_TIME_ARTICLE.Seconds() {
println("Cache expired for article" + BaseURL + tempURL + "\nLast Retrieved: " + cacheArticle.LastRetrieved.String() + "\nDifference: " + time.Now().Sub(cacheArticle.LastRetrieved).String())
// expired cache entry, replace it
sch.articles.Remove(BaseURL + tempURL)
sch.articles.Delete(BaseURL + tempURL)
article = sch.QueryArticle(BaseURL+tempURL, article, dumpResponse)
sch.articles.Set(BaseURL+tempURL, article)
sch.articles.Store(BaseURL+tempURL, article)
} else {
println("Cache hit for article" + BaseURL + tempURL)
// not expired, update any new information
cacheArticle.NumCitations = article.NumCitations // update the citations since thats all that might change
article = cacheArticle
sch.articles.Set(BaseURL+tempURL, article)
sch.articles.Store(BaseURL+tempURL, article)
}
} else {
println("Cache miss for article" + BaseURL + tempURL)
article = sch.QueryArticle(BaseURL+tempURL, article, dumpResponse)
sch.articles.Set(BaseURL+tempURL, article)
sch.articles.Store(BaseURL+tempURL, article)
}
}
articles = append(articles, article)
Expand All @@ -199,7 +299,7 @@ func (sch Scholar) QueryProfileDumpResponse(user string, queryArticles bool, lim
return articles
}

func (sch Scholar) QueryArticle(url string, article Article, dumpResponse bool) Article {
func (sch *Scholar) QueryArticle(url string, article Article, dumpResponse bool) Article {
fmt.Println("PULLING ARTICLE: " + url)
article.ScholarURL = url
client := &http.Client{}
Expand Down Expand Up @@ -274,7 +374,8 @@ func (sch Scholar) QueryArticle(url string, article Article, dumpResponse bool)
article.Articles += 1
articles := s.Find(".gsc_oci_value")
articles.Find(".gsc_oci_merged_snippet").Each(func(i int, s *goquery.Selection) {
// each one of these is an article. For an scholar-example with multiple see: https://scholar.google.com/citations?view_op=view_citation&hl=en&user=ECQMeb0AAAAJ&citation_for_view=ECQMeb0AAAAJ:u5HHmVD_uO8C
// each one of these is an article. For a scholar-example with multiple see:
// https://scholar.google.com/citations?view_op=view_citation&hl=en&user=ECQMeb0AAAAJ&citation_for_view=ECQMeb0AAAAJ:u5HHmVD_uO8C
// this seems to happen if the entry is a book and there are Articles within it
s.Find(".gsc_oms_link").Each(func(i int, l *goquery.Selection) {
linkText := l.Text()
Expand Down
2 changes: 1 addition & 1 deletion scholar_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ func TestScholarQuerier(t *testing.T) {
}

func TestProfileQuerier(t *testing.T) {
sch := New()
sch := New("cache.json")

Check failure on line 18 in scholar_test.go

View workflow job for this annotation

GitHub Actions / test (1.21.x, ubuntu-latest)

not enough arguments in call to New

Check failure on line 18 in scholar_test.go

View workflow job for this annotation

GitHub Actions / test (1.21.x, ubuntu-latest)

not enough arguments in call to New
articles := sch.QueryProfile("SbUmSEAAAAAJ", 1)
assert.NotEmpty(t, articles)

Expand Down

0 comments on commit 39ca7fb

Please sign in to comment.