
Commit

Rebuilt the bot, moved old files into v1, new py file in main directory and in v2.
SimonGoring committed Oct 20, 2021
1 parent d27f9a3 commit d7f4dea
Showing 11 changed files with 400 additions and 203 deletions.
2 changes: 1 addition & 1 deletion Procfile
@@ -1 +1 @@
-worker: python neotomabot.py
+worker: python3 neotomabot.py
Binary file removed: c
264 changes: 65 additions & 199 deletions neotomabot.py
@@ -1,210 +1,76 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#!python3
""" Neotoma Database Twitter Manager v2.0
by: Simon Goring
This Twitter bot is intended to provide updated information to individuals about additions to the Neotoma
Paleoecology database. The script leverages the `schedule` package for Python, running continually in
the background, sending out tweets at a specified time and interval.
"""

from TwitterAPI import TwitterAPI
import random
import xmltodict
import urllib.request
import schedule
import time
import os

twitstuff = {'consumer_key': os.environ['consumer_key'],
             'consumer_secret': os.environ['consumer_secret'],
             'access_token_key': os.environ['access_token_key'],
             'access_token_secret': os.environ['access_token_secret']}
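
# The four credentials above must be present in the environment before the
# bot starts (e.g. as Heroku config vars, given the Procfile worker; the
# variable names match the dict keys above).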

datasets = set()

api = TwitterAPI(consumer_key=twitstuff['consumer_key'],
                 consumer_secret=twitstuff['consumer_secret'],
                 access_token_key=twitstuff['access_token_key'],
                 access_token_secret=twitstuff['access_token_secret'])
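
# Optional smoke test before scheduling anything (a hypothetical check;
# 'account/verify_credentials' is a standard Twitter v1.1 endpoint):
# r = api.request('account/verify_credentials')
# assert r.status_code == 200, 'Twitter authentication failed'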

def randomtweet(api):
""" Tweet a random statement from a plain text document. Passing in the twitter API object.
The tweets are all present in the file `resources/cannedtweets.txt`. These can be edited
directly on GitHub if anyone chooses to.
"""
with open('resources/cannedtweets.txt', 'r') as f:
alltweets = f.read().splitlines()
line = random.choice(alltweets)
api.request('statuses/update', {'status':line})

def recentsite(api):
""" Tweet one of the recent data uploads from Neotoma. Passing in the twitter API object.
This leverages the v1.5 API's XML response for recent uploads. It selects one of the new uploads
(except geochronology uploads) and tweets it out. It selects them randomly, and adds the selected
dataset to a set object so that values cannot be repeatedly tweeted out.
"""
with urllib.request.urlopen('https://api.neotomadb.org/v1.5/data/recentuploads/1') as response:
html = response.read()
output = xmltodict.parse(html)['results']['results']
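    # xmltodict returns nested dicts; each entry is shaped roughly like
    # {'record': {'datasetid': ..., 'datasettype': ..., 'sitename': ...,
    #             'databasename': ...}} (field names as used below).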
    records = list(filter(lambda x: x['record']['datasettype'] != 'geochronology'
                          and x['record']['datasetid'] not in datasets, output))
if len(records) > 0:
tweet = random.choice(records)['record']
while tweet['datasetid'] in datasets:
tweet = random.choice(records)['record']
string = "It's a new {datasettype} dataset from the {databasename} at {sitename}! https://data.neotomadb.org/{datasetid}".format(**tweet)
if len(string) < 280:
api.request('statuses/update', {'status':string})
datasets.add(tweet['datasetid'])
else:
string = "It's a new dataset from the {databasename} at {sitename}! https://data.neotomadb.org/{datasetid}".format(**tweet)
if len(string) < 280:
api.request('statuses/update', {'status':string})
datasets.add(tweet['datasetid'])

import os, tweepy, time, sys, json, requests, random, datetime, schedule

def twit_auth():
# Authenticate the twitter session.
# Should only be needed once at the initiation of the code.

CONSUMER_KEY = os.environ['CONSUMER_KEY']
CONSUMER_SECRET = os.environ['CONSUMER_SECRET']
ACCESS_KEY = os.environ['ACCESS_KEY']
ACCESS_SECRET = os.environ['ACCESS_SECRET']

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
api = tweepy.API(auth)
print('Twitter authenticated \n')
return api


def check_neotoma():
    # This function calls the Neotoma "recent uploads" endpoint, reads a text
    # file of previously seen records, compares the two, and writes all the
    # 'new' records to a different text file.
    # Returns the number of new records found.

# inputs:
# 1. text file: old_results.json
# 2. text file: to_print.json
# 3. json call: neotoma

with open('old_results.json', 'r') as old_file:
old_calls = json.loads(old_file.read())

with open('to_print.json', 'r') as print_file:
to_print = json.loads(print_file.read())

neotoma = requests.get("http://ceiwin10.cei.psu.edu/NDB/RecentUploads?months=1")
inp_json = json.loads(neotoma.text)['data']
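    # Each record in these lists is a dict carrying at least "DatasetID",
    # "SiteName", "DatasetType" and "Investigator" (the fields used below).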

def get_datasets(x):
did = []
for y in x:
did.append(y["DatasetID"])
return did

neo_datasets = get_datasets(inp_json)
old_datasets = get_datasets(old_calls)
new_datasets = get_datasets(to_print)

    # At this point we have the numeric dataset IDs for the most recent month
    # of new uploads to Neotoma (neo_datasets), all the ones we've already
    # tweeted (old_datasets), and all the ones in our queue (new_datasets).
#
# The next thing we want to do is to remove all the neo_datasets that
# are in old_datasets and then remove all the new_datasets that are
# in neo_datasets, append neo_datasets to new_datasets (if new_datasets
# has a length > 0) and then dump new_datasets.
#
# Old datasets gets re-written when the tweets go out.

    # Remove recent datasets that have already been tweeted. Walk backwards so
    # deletions don't shift indices still to be visited; the range must run
    # all the way down to index 0 (stop value -1).
    for i in range(len(neo_datasets)-1, -1, -1):
if neo_datasets[i] in old_datasets:
del inp_json[i]

# This now gives us a pared down version of inp_json
# Now we need to make sure to add any of the to_print to neo_dataset.
# We do this by cycling through new_datasets. Any dataset number that
# is not in old_datasets or neo_datasets gets added to the beginning of
# the new list. This way it is always the first called up when twitter
# posts:

    for i in range(len(new_datasets)):
if new_datasets[i] not in old_datasets and new_datasets[i] not in neo_datasets:
inp_json.insert(0,to_print[i])

# Now write out to file. Old file doesn't get changed until the
# twitter app is run.
with open('to_print.json', 'w') as print_file:
json.dump(inp_json, print_file)
return len(inp_json) - len(to_print)
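
# The index bookkeeping in check_neotoma can be expressed more directly with
# sets; a hypothetical, behaviour-equivalent sketch using the function's own
# names:
# old_ids = set(old_datasets)
# queue = [rec for rec in to_print if rec["DatasetID"] not in old_ids]
# queued_ids = {rec["DatasetID"] for rec in queue}
# fresh = [rec for rec in inp_json
#          if rec["DatasetID"] not in old_ids
#          and rec["DatasetID"] not in queued_ids]
# inp_json = queue + fresh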

def print_neotoma_update(api):
# Check for new records by using the neotoma "recent" API:
old_toprint = check_neotoma()

# load files:
with open('to_print.json', 'r') as print_file:
to_print = json.loads(print_file.read())
with open('old_results.json', 'r') as print_file:
old_files = json.loads(print_file.read())

print('Neotoma dataset updated.\n')
    if old_toprint == 1:
        # Only a single site has been added:
        line = "I've got a backlog of " + str(len(to_print)) + " sites to tweet and " + str(old_toprint) + " site has been added since I last checked Neotoma. http://neotomadb.org"
    elif old_toprint > 1:
        line = "I've got a backlog of " + str(len(to_print)) + " sites to tweet and " + str(old_toprint) + " sites have been added since I last checked Neotoma. http://neotomadb.org"
    else:
        line = "I've got a backlog of " + str(len(to_print)) + " sites to tweet. Nothing new has been added since I last checked. http://neotomadb.org"

    try:
        print('%s' % line)
api.update_status(status=line)
except tweepy.error.TweepError:
print("Twitter error raised")

def post_tweet(api):
# Read in the printable tweets:
with open('to_print.json', 'r') as print_file:
to_print = json.loads(print_file.read())

with open('old_results.json', 'r') as print_file:
old_files = json.loads(print_file.read())

    print('Files opened\n')

    # Guard against an empty queue (random.randint fails on an empty range):
    if len(to_print) == 0:
        return
    pr_tw = random.randint(0, len(to_print) - 1)
    site = to_print[pr_tw]

    # Prepare to tweet the randomly selected record from to_print:
weblink = 'http://apps.neotomadb.org/Explorer/?datasetid=' + str(site["DatasetID"])

    # Dataset names can be long; progressively shorten the tweet if needed:

line = 'Neotoma welcomes ' + site["SiteName"] + ', a ' + site["DatasetType"] + ' dataset by ' + site["Investigator"] + " " + weblink

    # There are a few reasons the line might be very long: one is the site
    # name, the other is the author list. The 170-character threshold leaves
    # headroom under Twitter's 280-character limit once t.co wraps the link.
    if len(line) > 170:
        line = 'Neotoma welcomes ' + site["SiteName"] + " by " + site["Investigator"] + " " + weblink

    # If it's still too long, clip the author list:
    if len(line) > 170 and ',' in site["Investigator"]:
        author = site["Investigator"][0:site["Investigator"].find(',')]
        line = 'Neotoma welcomes ' + site["SiteName"] + " by " + author + " et al. " + weblink

try:
print('%s' % line)
api.update_status(status=line)
old_files.append(site)
del to_print[pr_tw]
with open('to_print.json', 'w') as print_file:
json.dump(to_print, print_file)
with open('old_results.json', 'w') as print_file:
json.dump(old_files, print_file)
except tweepy.error.TweepError:
print("Twitter error raised")


def self_identify(api):

# Identify myself as the owner of the bot:
line = 'This twitter bot for the Neotoma Paleoecological Database is managed by @sjgoring. Letting you know what\'s new at http://neotomadb.org'
try:
print('%s' % line)
api.update_status(status=line)
except tweepy.error.TweepError:
print("Twitter error raised")

def self_identify_hub(api):
# Identify the codebase for the bot:
line = 'This twitter bot for the Neotoma Paleoecological Database is programmed in #python and publicly available through an MIT License on GitHub: https://github.com/SimonGoring/neotomabot'
try:
print('%s' % line)
api.update_status(status=line)
except tweepy.error.TweepError:
print("Twitter error raised")

def other_inf_hub(api):
    # Tweet one randomly chosen fact about Neotoma from the list below:
line = ['The bot for the Neotoma Database is programmed in #python and publicly available through an MIT License on GitHub: https://github.com/SimonGoring/neotomabot',
'Neotoma has teaching modules you can use in the class room, check it out: https://www.neotomadb.org/education/category/higher_ed/',
'The governance for Neotoma includes representatives from our constituent databases. Find out more: https://www.neotomadb.org/about/category/governance',
'We are invested in #cyberinfrastructure. Our response to emerging challenges is posted on @authorea: https://www.authorea.com/users/152134/articles/165940-cyberinfrastructure-in-the-paleosciences-mobilizing-long-tail-data-building-distributed-community-infrastructure-empowering-individual-geoscientists',
'We keep a list of all publications that have used Neotoma for their research. Want to be added? Contact us! https://www.neotomadb.org/references',
'These days everyone\'s got a Google Scholar page. So does Neotoma! https://scholar.google.ca/citations?user=idoixqkAAAAJ&hl=en',
'If you use #rstats then you can access Neotoma data directly thanks to @rOpenSci! https://ropensci.org/tutorials/neotoma_tutorial.html',
'Neotoma is more than just pollen & mammals; it contains 28 data types incl phytoliths & biochemistry data. Explore! https://www.neotomadb.org/data/category/explorer',
'Think you\'ve got better tweets? Add them to my code & make a pull request! https://github.com/SimonGoring/neotomabot',
'Behold, the very first Neotoma dataset, ID 1: https://apps.neotomadb.org/explorer/?datasetid=1',
'We\'ve got some new R tutorials up online. Is there anything you\'d like to do with Neotoma? http://neotomadb.github.io',
'Neotoma is a member of the @ICSU_WDS, working to share best practices for data stewardship.',
'Are you presenting at an upcoming meeting? Will you be talking about Neotoma? Let us know and we can help get the word out! Contact @sjgoring',
'You know you want to slide into these mentions. . . Let us know what cool #pollen, #paleoecology, #archaeology, #whatever you\'re doing with Neotoma data!',
'Referencing Neotoma? Why not check out our Quaternary Research paper? https://doi.org/10.1017/qua.2017.105',
'How is Neotoma leveraging text mining to improve its data holdings? Find out on the @earthcube blog: https://earthcube.wordpress.com/2018/03/06/geodeepdive-into-darkdata/',
"Building an application that could leverage Neotoma data? Our API (https://api-dev.neotomadb.org) is public and open: https://github.com/NeotomaDB/api_nodetest/",
"The landing pages for Neotoma were built using Vue.js, all code is published on Github at https://github.com/NeotomaDB/ndbLandingPage",
"Learn more about how Neotoma makes the most of teaching and cutting-edge research in a new publication in Elements of Paleontology: http://dx.doi.org/10.1017/9781108681582",
"Neotoma is on Slack. Come join the discussion and get involved! We're looking for folks to help with documentation, stewardship and coding. https://join.slack.com/t/neotomadb/shared_invite/zt-cvsv53ep-wjGeCTkq7IhP6eUNA9NxYQ"
]

    try:
        choice = random.choice(line)
        print('%s' % choice)
        api.update_status(status=choice)
    except tweepy.error.TweepError:
        print("Twitter error raised")
""" Identify the codebase for the bot through a tweet. """
line = 'This twitter bot for the Neotoma Paleoecological Database is programmed in #python and publicly available through an MIT License on GitHub: https://github.com/NeotomaDB/neotomabot'
api.request('statuses/update', {'status':line})

api = twit_auth()

schedule.every(3).hours.do(post_tweet, api)
schedule.every().day.at("15:37").do(print_neotoma_update, api)
schedule.every().wednesday.at("14:30").do(self_identify, api)
schedule.every(6).hours.do(recentsite, api)
schedule.every(5).hours.do(randomtweet, api)
schedule.every().monday.at("14:30").do(self_identify_hub, api)
schedule.every().day.at("10:30").do(other_inf_hub, api)

8 changes: 5 additions & 3 deletions requirements.txt
@@ -1,3 +1,5 @@
-tweepy==3.7.0
-requests==2.21.0
-schedule==0.5.0
+schedule==1.1.0
+requests==2.22.0
+xmltodict==0.12.0
+tweepy==4.1.0
+TwitterAPI==2.7.5
18 changes: 18 additions & 0 deletions resources/cannedtweets.txt
@@ -0,0 +1,18 @@
The bot for the Neotoma Database is programmed in #python and publicly available through an MIT License on GitHub: https://github.com/NeotomaDB/neotomabot
Neotoma has teaching modules you can use in the classroom, check it out: https://www.neotomadb.org/education/category/higher_ed/
Governance for Neotoma includes representatives from our 34 constituent databases. Find out more: https://www.neotomadb.org/about/category/governance
We are invested in #cyberinfrastructure. Our response to emerging challenges is posted on @authorea: https://www.authorea.com/users/152134/articles/165940-cyberinfrastructure-in-the-paleosciences-mobilizing-long-tail-data-building-distributed-community-infrastructure-empowering-individual-geoscientists
There's a big @zotero library of Neotoma publications that we've been working on. Check it out here: https://www.zotero.org/groups/2321378/neotomadb
Neotoma is more than just pollen & mammals; it contains 28 data types incl phytoliths & biochemistry data. Explore! https://apps.neotomadb.org/explorer
Think you've got better tweets? Add them to my code & make a pull request! https://github.com/NeotomaDB/neotomabot
Behold, the very first Neotoma dataset, ID 1: https://apps.neotomadb.org/explorer/?datasetid=1
Our site at https://open.neotomadb.org hosts all our #openscience work, including a link to the database schema. Check it out!
Neotoma is a member of the @ICSU_WDS, working to share best practices for data stewardship.
Are you presenting at an upcoming meeting? Will you be talking about Neotoma? Let us know and we can help get the word out! Contact @sjgoring
You know you want to slide into these mentions. . . Let us know what cool #pollen, #paleoecology, #archaeology, #whatever you're doing with Neotoma data!
Referencing Neotoma? Why not check out our Quaternary Research paper? https://doi.org/10.1017/qua.2017.105
How is Neotoma leveraging text mining to improve its data holdings? We've been working with @geodeepdive to discover articles that have yet to be submitted to the database. @earthcube
Building an application that could leverage Neotoma data? Our API (https://api.neotomadb.org) is public and open: https://github.com/NeotomaDB/api_nodetest/ #openscience
The landing pages for Neotoma were built using Vue.js, all code is published on Github at https://github.com/NeotomaDB/ndbLandingPage Check them out here: https://data.neotomadb.org
Learn more about how Neotoma makes the most of teaching and cutting-edge research in our Elements of Paleontology publication: http://dx.doi.org/10.1017/9781108681582
Neotoma is on Slack. Come join the discussion and get involved! We're looking for folks to help with documentation, stewardship and coding. https://join.slack.com/t/neotomadb/shared_invite/zt-cvsv53ep-wjGeCTkq7IhP6eUNA9NxYQ
19 changes: 19 additions & 0 deletions resources/cannedtwttes.txt
@@ -0,0 +1,19 @@
The bot for the Neotoma Database is programmed in #python and publicly available through an MIT License on GitHub: https://github.com/NeotomaDB/neotomabot
Neotoma has teaching modules you can use in the class room, check it out: https://www.neotomadb.org/education/category/higher_ed/
The governance for Neotoma includes representatives from our constituent databases. Find out more: https://www.neotomadb.org/about/category/governance
We are invested in #cyberinfrastructure. Our response to emerging challenges is posted on @authorea: https://www.authorea.com/users/152134/articles/165940-cyberinfrastructure-in-the-paleosciences-mobilizing-long-tail-data-building-distributed-community-infrastructure-empowering-individual-geoscientists
We keep a list of all publications that have used Neotoma for their research. Want to be added? Contact us! https://www.neotomadb.org/references
These days everyone's got a Google Scholar page. So does Neotoma! https://scholar.google.ca/citations?user=idoixqkAAAAJ&hl=en
Neotoma is more than just pollen & mammals; it contains 28 data types incl phytoliths & biochemistry data. Explore! https://apps.neotomadb.org/explorer
Think you've got better tweets? Add them to my code & make a pull request! https://github.com/NeotomaDB/neotomabot
Behold, the very first Neotoma dataset, ID 1: https://apps.neotomadb.org/explorer/?datasetid=1
Our site at https://open.neotomadb.org hosts all our #openscience work, including a link to the database schema. Check it out!
Neotoma is a member of the @ICSU_WDS, working to share best practices for data stewardship.
Are you presenting at an upcoming meeting? Will you be talking about Neotoma? Let us know and we can help get the word out! Contact @sjgoring
You know you want to slide into these mentions. . . Let us know what cool #pollen, #paleoecology, #archaeology, #whatever you're doing with Neotoma data!
Referencing Neotoma? Why not check out our Quaternary Research paper? https://doi.org/10.1017/qua.2017.105
How is Neotoma leveraging text mining to improve its data holdings? We've been working with @geodeepdive to discover articles that have yet to be submitted to the database.
Building an application that could leverage Neotoma data? Our API (https://api.neotomadb.org) is public and open: https://github.com/NeotomaDB/api_nodetest/ #openscience
The landing pages for Neotoma were built using Vue.js, all code is published on Github at https://github.com/NeotomaDB/ndbLandingPage Check them out here: https://data.neotomadb.org
Learn more about how Neotoma makes the most of teaching and cutting-edge research in our Elements of Paleontology publication: http://dx.doi.org/10.1017/9781108681582
Neotoma is on Slack. Come join the discussion and get involved! We're looking for folks to help with documentation, stewardship and coding. https://join.slack.com/t/neotomadb/shared_invite/zt-cvsv53ep-wjGeCTkq7IhP6eUNA9NxYQ
