
Commit

Rebuilt the bot, moved old files into v1, new py file in main directory and in v2.
SimonGoring committed Oct 20, 2021
1 parent d27f9a3 commit d7f4dea
Showing 11 changed files with 400 additions and 203 deletions.
2 changes: 1 addition & 1 deletion Procfile
@@ -1 +1 @@
-worker: python neotomabot.py
+worker: python3 neotomabot.py
Binary file removed: c
264 changes: 65 additions & 199 deletions neotomabot.py
@@ -1,210 +1,76 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#!python3
""" Neotoma Database Twitter Manager v2.0
by: Simon Goring
This Twitter bot is intended to provide updated information to individuals about additions to the Neotoma
Paleoecology database. The script leverages the `schedule` package for Python, running continually in
the background, sending out tweets at a specified time and interval.
"""

from TwitterAPI import TwitterAPI
import random
import xmltodict
import urllib.request
import schedule
import time
import os

twitstuff = {'consumer_key': os.environ['consumer_key'],
             'consumer_secret': os.environ['consumer_secret'],
             'access_token_key': os.environ['access_token_key'],
             'access_token_secret': os.environ['access_token_secret']}
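
# The four credentials above must be present in the environment before the
# bot starts (e.g. as Heroku config vars, given the Procfile worker; the
# variable names match the dict keys above).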

datasets = set()

api = TwitterAPI(consumer_key=twitstuff['consumer_key'],
                 consumer_secret=twitstuff['consumer_secret'],
                 access_token_key=twitstuff['access_token_key'],
                 access_token_secret=twitstuff['access_token_secret'])
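
# Optional smoke test before scheduling anything (a hypothetical check;
# 'account/verify_credentials' is a standard Twitter v1.1 endpoint):
# r = api.request('account/verify_credentials')
# assert r.status_code == 200, 'Twitter authentication failed'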

def randomtweet(api):
""" Tweet a random statement from a plain text document. Passing in the twitter API object.
The tweets are all present in the file `resources/cannedtweets.txt`. These can be edited
directly on GitHub if anyone chooses to.
"""
with open('resources/cannedtweets.txt', 'r') as f:
alltweets = f.read().splitlines()
line = random.choice(alltweets)
api.request('statuses/update', {'status':line})

def recentsite(api):
""" Tweet one of the recent data uploads from Neotoma. Passing in the twitter API object.
This leverages the v1.5 API's XML response for recent uploads. It selects one of the new uploads
(except geochronology uploads) and tweets it out. It selects them randomly, and adds the selected
dataset to a set object so that values cannot be repeatedly tweeted out.
"""
with urllib.request.urlopen('https://api.neotomadb.org/v1.5/data/recentuploads/1') as response:
html = response.read()
output = xmltodict.parse(html)['results']['results']
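    # xmltodict returns nested dicts; each entry is shaped roughly like
    # {'record': {'datasetid': ..., 'datasettype': ..., 'sitename': ...,
    #             'databasename': ...}} (field names as used below).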
    records = list(filter(lambda x: x['record']['datasettype'] != 'geochronology'
                          and x['record']['datasetid'] not in datasets, output))
if len(records) > 0:
tweet = random.choice(records)['record']
while tweet['datasetid'] in datasets:
tweet = random.choice(records)['record']
string = "It's a new {datasettype} dataset from the {databasename} at {sitename}! https://data.neotomadb.org/{datasetid}".format(**tweet)
if len(string) < 280:
api.request('statuses/update', {'status':string})
datasets.add(tweet['datasetid'])
else:
string = "It's a new dataset from the {databasename} at {sitename}! https://data.neotomadb.org/{datasetid}".format(**tweet)
if len(string) < 280:
api.request('statuses/update', {'status':string})
datasets.add(tweet['datasetid'])

import os, tweepy, time, sys, json, requests, random, datetime, schedule

def twit_auth():
# Authenticate the twitter session.
# Should only be needed once at the initiation of the code.

CONSUMER_KEY = os.environ['CONSUMER_KEY']
CONSUMER_SECRET = os.environ['CONSUMER_SECRET']
ACCESS_KEY = os.environ['ACCESS_KEY']
ACCESS_SECRET = os.environ['ACCESS_SECRET']

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
api = tweepy.API(auth)
print('Twitter authenticated \n')
return api


def check_neotoma():
    # This function calls the Neotoma "recent uploads" endpoint, reads a text
    # file of previously seen records, compares the two, and writes all the
    # 'new' records to a different text file.
    # Returns the number of new records found.

# inputs:
# 1. text file: old_results.json
# 2. text file: to_print.json
# 3. json call: neotoma

with open('old_results.json', 'r') as old_file:
old_calls = json.loads(old_file.read())

with open('to_print.json', 'r') as print_file:
to_print = json.loads(print_file.read())

neotoma = requests.get("http://ceiwin10.cei.psu.edu/NDB/RecentUploads?months=1")
inp_json = json.loads(neotoma.text)['data']
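    # Each record in these lists is a dict carrying at least "DatasetID",
    # "SiteName", "DatasetType" and "Investigator" (the fields used below).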

def get_datasets(x):
did = []
for y in x:
did.append(y["DatasetID"])
return did

neo_datasets = get_datasets(inp_json)
old_datasets = get_datasets(old_calls)
new_datasets = get_datasets(to_print)

    # At this point we have the numeric dataset IDs for the most recent month
    # of new uploads to Neotoma (neo_datasets), all the ones we've already
    # tweeted (old_datasets), and all the ones in our queue (new_datasets).
#
# The next thing we want to do is to remove all the neo_datasets that
# are in old_datasets and then remove all the new_datasets that are
# in neo_datasets, append neo_datasets to new_datasets (if new_datasets
# has a length > 0) and then dump new_datasets.
#
# Old datasets gets re-written when the tweets go out.

    # Remove recent datasets that have already been tweeted. Walk backwards so
    # deletions don't shift indices still to be visited; the range must run
    # all the way down to index 0 (stop value -1).
    for i in range(len(neo_datasets)-1, -1, -1):
if neo_datasets[i] in old_datasets:
del inp_json[i]

# This now gives us a pared down version of inp_json
# Now we need to make sure to add any of the to_print to neo_dataset.
# We do this by cycling through new_datasets. Any dataset number that
# is not in old_datasets or neo_datasets gets added to the beginning of
# the new list. This way it is always the first called up when twitter
# posts:

    for i in range(len(new_datasets)):
if new_datasets[i] not in old_datasets and new_datasets[i] not in neo_datasets:
inp_json.insert(0,to_print[i])

# Now write out to file. Old file doesn't get changed until the
# twitter app is run.
with open('to_print.json', 'w') as print_file:
json.dump(inp_json, print_file)
return len(inp_json) - len(to_print)
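
# The index bookkeeping in check_neotoma can be expressed more directly with
# sets; a hypothetical, behaviour-equivalent sketch using the function's own
# names:
# old_ids = set(old_datasets)
# queue = [rec for rec in to_print if rec["DatasetID"] not in old_ids]
# queued_ids = {rec["DatasetID"] for rec in queue}
# fresh = [rec for rec in inp_json
#          if rec["DatasetID"] not in old_ids
#          and rec["DatasetID"] not in queued_ids]
# inp_json = queue + fresh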

def print_neotoma_update(api):
# Check for new records by using the neotoma "recent" API:
old_toprint = check_neotoma()

# load files:
with open('to_print.json', 'r') as print_file:
to_print = json.loads(print_file.read())
with open('old_results.json', 'r') as print_file:
old_files = json.loads(print_file.read())

print('Neotoma dataset updated.\n')
    if old_toprint == 1:
        # Only a single site has been added:
        line = "I've got a backlog of " + str(len(to_print)) + " sites to tweet and " + str(old_toprint) + " site has been added since I last checked Neotoma. http://neotomadb.org"
    elif old_toprint > 1:
        line = "I've got a backlog of " + str(len(to_print)) + " sites to tweet and " + str(old_toprint) + " sites have been added since I last checked Neotoma. http://neotomadb.org"
    else:
        line = "I've got a backlog of " + str(len(to_print)) + " sites to tweet. Nothing new has been added since I last checked. http://neotomadb.org"

    try:
        print('%s' % line)
api.update_status(status=line)
except tweepy.error.TweepError:
print("Twitter error raised")

def post_tweet(api):
# Read in the printable tweets:
with open('to_print.json', 'r') as print_file:
to_print = json.loads(print_file.read())

with open('old_results.json', 'r') as print_file:
old_files = json.loads(print_file.read())

    print('Files opened\n')

    # Guard against an empty queue (random.randint fails on an empty range):
    if len(to_print) == 0:
        return
    pr_tw = random.randint(0, len(to_print) - 1)
    site = to_print[pr_tw]

    # Prepare to tweet the randomly selected record from to_print:
weblink = 'http://apps.neotomadb.org/Explorer/?datasetid=' + str(site["DatasetID"])

    # Dataset names can be long; progressively shorten the tweet if needed:

line = 'Neotoma welcomes ' + site["SiteName"] + ', a ' + site["DatasetType"] + ' dataset by ' + site["Investigator"] + " " + weblink

    # There are a few reasons the line might be very long: one is the site
    # name, the other is the author list. The 170-character threshold leaves
    # headroom under Twitter's 280-character limit once t.co wraps the link.
    if len(line) > 170:
        line = 'Neotoma welcomes ' + site["SiteName"] + " by " + site["Investigator"] + " " + weblink

    # If it's still too long, clip the author list:
    if len(line) > 170 and ',' in site["Investigator"]:
        author = site["Investigator"][0:site["Investigator"].find(',')]
        line = 'Neotoma welcomes ' + site["SiteName"] + " by " + author + " et al. " + weblink

try:
print('%s' % line)
api.update_status(status=line)
old_files.append(site)
del to_print[pr_tw]
with open('to_print.json', 'w') as print_file:
json.dump(to_print, print_file)
with open('old_results.json', 'w') as print_file:
json.dump(old_files, print_file)
except tweepy.error.TweepError:
print("Twitter error raised")


def self_identify(api):

# Identify myself as the owner of the bot:
line = 'This twitter bot for the Neotoma Paleoecological Database is managed by @sjgoring. Letting you know what\'s new at http://neotomadb.org'
try:
print('%s' % line)
api.update_status(status=line)
except tweepy.error.TweepError:
print("Twitter error raised")

def self_identify_hub(api):
# Identify the codebase for the bot:
line = 'This twitter bot for the Neotoma Paleoecological Database is programmed in #python and publicly available through an MIT License on GitHub: https://github.com/SimonGoring/neotomabot'
try:
print('%s' % line)
api.update_status(status=line)
except tweepy.error.TweepError:
print("Twitter error raised")

def other_inf_hub(api):
    # Tweet one randomly chosen fact about Neotoma from the list below:
line = ['The bot for the Neotoma Database is programmed in #python and publicly available through an MIT License on GitHub: https://github.com/SimonGoring/neotomabot',
'Neotoma has teaching modules you can use in the class room, check it out: https://www.neotomadb.org/education/category/higher_ed/',
'The governance for Neotoma includes representatives from our constituent databases. Find out more: https://www.neotomadb.org/about/category/governance',
'We are invested in #cyberinfrastructure. Our response to emerging challenges is posted on @authorea: https://www.authorea.com/users/152134/articles/165940-cyberinfrastructure-in-the-paleosciences-mobilizing-long-tail-data-building-distributed-community-infrastructure-empowering-individual-geoscientists',
'We keep a list of all publications that have used Neotoma for their research. Want to be added? Contact us! https://www.neotomadb.org/references',
'These days everyone\'s got a Google Scholar page. So does Neotoma! https://scholar.google.ca/citations?user=idoixqkAAAAJ&hl=en',
'If you use #rstats then you can access Neotoma data directly thanks to @rOpenSci! https://ropensci.org/tutorials/neotoma_tutorial.html',
'Neotoma is more than just pollen & mammals; it contains 28 data types incl phytoliths & biochemistry data. Explore! https://www.neotomadb.org/data/category/explorer',
'Think you\'ve got better tweets? Add them to my code & make a pull request! https://github.com/SimonGoring/neotomabot',
'Behold, the very first Neotoma dataset, ID 1: https://apps.neotomadb.org/explorer/?datasetid=1',
'We\'ve got some new R tutorials up online. Is there anything you\'d like to do with Neotoma? http://neotomadb.github.io',
'Neotoma is a member of the @ICSU_WDS, working to share best practices for data stewardship.',
'Are you presenting at an upcoming meeting? Will you be talking about Neotoma? Let us know and we can help get the word out! Contact @sjgoring',
'You know you want to slide into these mentions. . . Let us know what cool #pollen, #paleoecology, #archaeology, #whatever you\'re doing with Neotoma data!',
'Referencing Neotoma? Why not check out our Quaternary Research paper? https://doi.org/10.1017/qua.2017.105',
'How is Neotoma leveraging text mining to improve its data holdings? Find out on the @earthcube blog: https://earthcube.wordpress.com/2018/03/06/geodeepdive-into-darkdata/',
"Building an application that could leverage Neotoma data? Our API (https://api-dev.neotomadb.org) is public and open: https://github.com/NeotomaDB/api_nodetest/",
"The landing pages for Neotoma were built using Vue.js, all code is published on Github at https://github.com/NeotomaDB/ndbLandingPage",
"Learn more about how Neotoma makes the most of teaching and cutting-edge research in a new publication in Elements of Paleontology: http://dx.doi.org/10.1017/9781108681582",
"Neotoma is on Slack. Come join the discussion and get involved! We're looking for folks to help with documentation, stewardship and coding. https://join.slack.com/t/neotomadb/shared_invite/zt-cvsv53ep-wjGeCTkq7IhP6eUNA9NxYQ"
]

    try:
        choice = random.choice(line)
        print('%s' % choice)
        api.update_status(status=choice)
    except tweepy.error.TweepError:
        print("Twitter error raised")
""" Identify the codebase for the bot through a tweet. """
line = 'This twitter bot for the Neotoma Paleoecological Database is programmed in #python and publicly available through an MIT License on GitHub: https://github.com/NeotomaDB/neotomabot'
api.request('statuses/update', {'status':line})

api = twit_auth()

schedule.every(3).hours.do(post_tweet, api)
schedule.every().day.at("15:37").do(print_neotoma_update, api)
schedule.every().wednesday.at("14:30").do(self_identify, api)
schedule.every(6).hours.do(recentsite, api)
schedule.every(5).hours.do(randomtweet, api)
schedule.every().monday.at("14:30").do(self_identify_hub, api)
schedule.every().day.at("10:30").do(other_inf_hub, api)

8 changes: 5 additions & 3 deletions requirements.txt
@@ -1,3 +1,5 @@
-tweepy==3.7.0
-requests==2.21.0
-schedule==0.5.0
+schedule==1.1.0
+requests==2.22.0
+xmltodict==0.12.0
+tweepy==4.1.0
+TwitterAPI==2.7.5
18 changes: 18 additions & 0 deletions resources/cannedtweets.txt
@@ -0,0 +1,18 @@
The bot for the Neotoma Database is programmed in #python and publicly available through an MIT License on GitHub: https://github.com/NeotomaDB/neotomabot
Neotoma has teaching modules you can use in the classroom, check it out: https://www.neotomadb.org/education/category/higher_ed/
Governance for Neotoma includes representatives from our 34 constituent databases. Find out more: https://www.neotomadb.org/about/category/governance
We are invested in #cyberinfrastructure. Our response to emerging challenges is posted on @authorea: https://www.authorea.com/users/152134/articles/165940-cyberinfrastructure-in-the-paleosciences-mobilizing-long-tail-data-building-distributed-community-infrastructure-empowering-individual-geoscientists
There's a big @zotero library of Neotoma publications that we've been working on. Check it out here: https://www.zotero.org/groups/2321378/neotomadb
Neotoma is more than just pollen & mammals; it contains 28 data types incl phytoliths & biochemistry data. Explore! https://apps.neotomadb.org/explorer
Think you've got better tweets? Add them to my code & make a pull request! https://github.com/NeotomaDB/neotomabot
Behold, the very first Neotoma dataset, ID 1: https://apps.neotomadb.org/explorer/?datasetid=1
Our site at https://open.neotomadb.org hosts all our #openscience work, including a link to the database schema. Check it out!
Neotoma is a member of the @ICSU_WDS, working to share best practices for data stewardship.
Are you presenting at an upcoming meeting? Will you be talking about Neotoma? Let us know and we can help get the word out! Contact @sjgoring
You know you want to slide into these mentions. . . Let us know what cool #pollen, #paleoecology, #archaeology, #whatever you're doing with Neotoma data!
Referencing Neotoma? Why not check out our Quaternary Research paper? https://doi.org/10.1017/qua.2017.105
How is Neotoma leveraging text mining to improve its data holdings? We've been working with @geodeepdive to discover articles that have yet to be submitted to the database. @earthcube
Building an application that could leverage Neotoma data? Our API (https://api.neotomadb.org) is public and open: https://github.com/NeotomaDB/api_nodetest/ #openscience
The landing pages for Neotoma were built using Vue.js, all code is published on Github at https://github.com/NeotomaDB/ndbLandingPage Check them out here: https://data.neotomadb.org
Learn more about how Neotoma makes the most of teaching and cutting-edge research in our Elements of Paleontology publication: http://dx.doi.org/10.1017/9781108681582
Neotoma is on Slack. Come join the discussion and get involved! We're looking for folks to help with documentation, stewardship and coding. https://join.slack.com/t/neotomadb/shared_invite/zt-cvsv53ep-wjGeCTkq7IhP6eUNA9NxYQ
19 changes: 19 additions & 0 deletions resources/cannedtwttes.txt
@@ -0,0 +1,19 @@
The bot for the Neotoma Database is programmed in #python and publicly available through an MIT License on GitHub: https://github.com/NeotomaDB/neotomabot
Neotoma has teaching modules you can use in the class room, check it out: https://www.neotomadb.org/education/category/higher_ed/
The governance for Neotoma includes representatives from our constituent databases. Find out more: https://www.neotomadb.org/about/category/governance
We are invested in #cyberinfrastructure. Our response to emerging challenges is posted on @authorea: https://www.authorea.com/users/152134/articles/165940-cyberinfrastructure-in-the-paleosciences-mobilizing-long-tail-data-building-distributed-community-infrastructure-empowering-individual-geoscientists
We keep a list of all publications that have used Neotoma for their research. Want to be added? Contact us! https://www.neotomadb.org/references
These days everyone's got a Google Scholar page. So does Neotoma! https://scholar.google.ca/citations?user=idoixqkAAAAJ&hl=en
Neotoma is more than just pollen & mammals; it contains 28 data types incl phytoliths & biochemistry data. Explore! https://apps.neotomadb.org/explorer
Think you've got better tweets? Add them to my code & make a pull request! https://github.com/NeotomaDB/neotomabot
Behold, the very first Neotoma dataset, ID 1: https://apps.neotomadb.org/explorer/?datasetid=1
Our site at https://open.neotomadb.org hosts all our #openscience work, including a link to the database schema. Check it out!
Neotoma is a member of the @ICSU_WDS, working to share best practices for data stewardship.
Are you presenting at an upcoming meeting? Will you be talking about Neotoma? Let us know and we can help get the word out! Contact @sjgoring
You know you want to slide into these mentions. . . Let us know what cool #pollen, #paleoecology, #archaeology, #whatever you're doing with Neotoma data!
Referencing Neotoma? Why not check out our Quaternary Research paper? https://doi.org/10.1017/qua.2017.105
How is Neotoma leveraging text mining to improve its data holdings? We've been working with @geodeepdive to discover articles that have yet to be submitted to the database.
Building an application that could leverage Neotoma data? Our API (https://api.neotomadb.org) is public and open: https://github.com/NeotomaDB/api_nodetest/ #openscience
The landing pages for Neotoma were built using Vue.js, all code is published on Github at https://github.com/NeotomaDB/ndbLandingPage Check them out here: https://data.neotomadb.org
Learn more about how Neotoma makes the most of teaching and cutting-edge research in our Elements of Paleontology publication: http://dx.doi.org/10.1017/9781108681582
Neotoma is on Slack. Come join the discussion and get involved! We're looking for folks to help with documentation, stewardship and coding. https://join.slack.com/t/neotomadb/shared_invite/zt-cvsv53ep-wjGeCTkq7IhP6eUNA9NxYQ
