Skip to content

[#465] introduce irods.experimental.client.http module #466

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 12 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions irods/experimental/client/http/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
```
(py3) userXY@HOSTNAME:~/python-irodsclient/irods/experimental/client/http$ bash iter_or_page.sh page
---
[row(COLL_NAME='/'),
row(COLL_NAME='/tempZone'),
row(COLL_NAME='/tempZone/home')]
---
[row(COLL_NAME='/tempZone/home/alice'),
row(COLL_NAME="/tempZone/home/alice/a'b"),
row(COLL_NAME='/tempZone/home/public')]
---
[row(COLL_NAME='/tempZone/home/public/rods'),
row(COLL_NAME='/tempZone/home/public/thing'),
row(COLL_NAME='/tempZone/home/rods')]
---
[row(COLL_NAME='/tempZone/home/rods/c_files'),
row(COLL_NAME='/tempZone/home/rods/hello'),
row(COLL_NAME='/tempZone/trash')]
---
[row(COLL_NAME='/tempZone/trash/home'),
row(COLL_NAME='/tempZone/trash/home/alice'),
row(COLL_NAME='/tempZone/trash/home/public')]
---
[row(COLL_NAME='/tempZone/trash/home/rods')]
---
(py3) userXY@HOSTNAME:~/python-irodsclient/irods/experimental/client/http$ bash iter_or_page.sh iter
---
row(COLL_NAME='/')
---
row(COLL_NAME='/tempZone')
---
row(COLL_NAME='/tempZone/home')
---
row(COLL_NAME='/tempZone/home/alice')
---
row(COLL_NAME="/tempZone/home/alice/a'b")
---
row(COLL_NAME='/tempZone/home/public')
---
row(COLL_NAME='/tempZone/home/public/rods')
---
row(COLL_NAME='/tempZone/home/public/thing')
---
row(COLL_NAME='/tempZone/home/rods')
---
row(COLL_NAME='/tempZone/home/rods/c_files')
---
row(COLL_NAME='/tempZone/home/rods/hello')
---
row(COLL_NAME='/tempZone/trash')
---
row(COLL_NAME='/tempZone/trash/home')
---
row(COLL_NAME='/tempZone/trash/home/alice')
---
row(COLL_NAME='/tempZone/trash/home/public')
---
row(COLL_NAME='/tempZone/trash/home/rods')

>>>
```
230 changes: 230 additions & 0 deletions irods/experimental/client/http/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
import collections
import enum
import functools
import itertools
import json
import logging
import requests
import sys
from .iterator_functions import *

logger = logging.getLogger(__name__)
MAX_INT32 = 2**31-1
DEFAULT_PAGE_SIZE = 512

# -----

# Abstractions that let us either page through a general query <count> items at a time,
# or treat it like a Pythonic generator aka stateful iterator.
# (See the README.md in this directory.)

# TODO: The README is temporary. Make some better docs.

class _pageable:
def __init__(self, callable_):
"""callable_ is a function-like object called without parameters.
It pages once through the set of query results and should be
stateful in terms of maintaining current offset within the query.
"""
self.callable_ = callable_
def next_page(self):
page = list(self.callable_())
return page

class _iterable(_pageable):
"""Adapts a pageable interface to return one query row at a time. An
empty [] returned from next_page signals the end of query results.
"""
@functools.wraps(_pageable.__init__)
def __init__(self,*_):
super().__init__(*_)
self.__P = None
self.index = 0
# Allow iter() on instances.
def __iter__(self): return self
def __next__(self):
"""Called implicitly by any iteration over the _iterable instance.
Returns one query row.
"""
if self.__P is None or self.index >= len(self.__P):
self.__P = self.next_page()
self.index = 0
if 0 == len(self.__P):
raise StopIteration
element = self.__P[self.index]
self.index += 1
return element

# -----

class HTTP_operation_error(RuntimeError):
    """Raised when an HTTP call to the iRODS server does not succeed."""

def _normalized_columns(columns):
if not isinstance(columns,(list,tuple)):
columns = filter(None, (_.strip() for _ in columns.split(',')))

# de-duplicate
columns = collections.OrderedDict((col,None) for col in columns)

col_names = tuple(columns.keys())
cls = collections.namedtuple('row', col_names)
return cls, ",".join(col_names)

class DataObject:
    """Catalog column metadata for data objects, for use with genquery1."""
    class column:
        class enum(enum.Enum):
            DATA_ID = 401
            DATA_COLL_ID = 402
            DATA_NAME = 403
            DATA_REPL_NUM = 404
            # TODO: complete this list
        # Column names in declaration order, as genquery1 expects them.
        names = list(enum.__members__)

class Collection:
    """A collection in the catalog, materialized by id via a manager."""
    class column:
        class enum(enum.Enum):
            COLL_ID = 500
            COLL_NAME = 501
            # TODO: complete this list
        # Column names in declaration order, as genquery1 expects them.
        names = list(enum.__members__)

    # for heavyweight style of getter only!
    def __init__(self, mgr, id_):
        self.mgr = mgr
        self.id = id_

    @property
    def name(self):
        """Logical path of this collection, looked up through the manager."""
        return self.mgr.value_by_column_name(self.id, 'COLL_NAME')

# -----------------
# Manager/heavyweight approach to a catalog object "getter":
#
# This is an approximation of the old PRC approach
# for getting an instance of a collection by its main
# identifying data, the logical pathname.
#
# We most likely will not be doing things this way.
# (See Session.data_object_replicas() method below.)

class Manager:
    """Base for catalog-object managers bound to a Session.

    Part of the heavyweight/manager-based "getter" style approximating
    the older PRC API.
    """
    def __init__(self, session):
        # Previously also bound a redundant local ('sess = self.sess = ...').
        self.sess = session

    def value_by_column_name(self, id_, column_name: str):
        """Return the value of *column_name* for the row whose COLL_ID is *id_*.

        NOTE(review): the condition hardcodes COLL_ID, so as written this
        base class is collection-specific — confirm before reusing it for
        other catalog object types.

        Raises too_few_results / too_many_results (via one()) when the
        query does not return exactly one row.
        """
        first_row = one(self.sess.genquery1(columns=[column_name],
                                            condition="COLL_ID = '{}'", args=[id_]))
        return getattr(first_row, column_name)

class CollManager(Manager):
    """Manager for Collection objects (heavyweight getter style)."""

    def name_from_id(self, id_):
        """Return the logical path of the collection whose COLL_ID is *id_*."""
        row = one(self.sess.genquery1(columns=['COLL_NAME'],
                                      condition="COLL_ID = '{}'", args=[id_]))
        return row.COLL_NAME

    def get(self, collname):
        """Fetch the Collection at logical path *collname* by querying its id."""
        rows = self.sess.genquery1(columns='COLL_ID',
                                   condition="COLL_NAME = '{}'", args=[collname])
        return Collection(self, int(one(rows).COLL_ID))

# -----------------

class Session:
    """A connection to the iRODS HTTP API.

    Authenticates on construction (storing a bearer token) and exposes
    endpoint wrappers — currently GenQuery1 via genquery1(), plus the
    thin data_object() getter built on top of it.
    """

    # Template for the base URL of every endpoint call.
    url_base_template = 'http://{self.host}:{self.port}/irods-http/{self.version}'

    # Convenient object properties.

    @property
    def url_base(self):
        """Base URL built from this session's host, port and API version."""
        return self.url_base_template.format(self=self)

    def url(self, endpoint_name):
        """Full URL for *endpoint_name* (surrounding '/' characters tolerated)."""
        return self.url_base + "/" + endpoint_name.strip("/")

    @property
    def auth_header(self):
        """Authorization header dict carrying this session's bearer token."""
        return {'Authorization': 'Bearer ' + self.bearer_token}

    # Low-level basis for implementing an endpoint via HTTP 'GET'.

    def http_get(self, endpoint_name, **param_key_value_pairs):
        """Issue a GET to *endpoint_name*; keyword args become query parameters.

        Returns the decoded response body.  Raises HTTP_operation_error
        on any non-success HTTP status.
        """
        r = requests.get(self.url(endpoint_name),
                         headers=self.auth_header,
                         params=param_key_value_pairs)
        if not r.ok:
            raise HTTP_operation_error("Failed in GET.")
        return r.content.decode()

    # -----------------
    # Thin/lightweight approach to catalog object "getter":
    #
    def data_object(self, logical_path, *,
                    query_options=(('offset', 0), ('count', DEFAULT_PAGE_SIZE))):
        """Return a row iterator over all replicas of *logical_path*.

        The logical path is split into collection and data-object name
        for the GenQuery condition.
        """
        coll, data = logical_path.rsplit('/', 1)
        # TODO: embedded quotes in object names will not work here.
        return self.genquery1(DataObject.column.names + Collection.column.names,
                              "COLL_NAME = '{}' and DATA_NAME = '{}'".format(coll, data),
                              extra_query_options=dict(query_options))

    # Each endpoint can have its own method definition.

    def genquery1(self, columns, condition='', *, args=(), extra_query_options=()):
        """Return a generator-style iterator over all row results.
        Example:
            for row in session.genquery1( 'COLL_NAME' ):
                print(row.COLL_NAME)

        By default, one HTTP call to the server returns a single "row", which is not
        terribly efficient. We can override the "count" option with an arbitrary
        positive integer, effectively increasing the paging size for the query:

            session.genquery1(columns, extra_query_options=dict(count=512))

        Since this function's result (a row-wise iterator) is page-size agnostic, its
        usage is not altered, whereas the efficiency for large queries will greatly
        improve due to the 512-fold decrease in the number of API calls.
        """
        condition = condition.format(*args)
        row_class, columns = _normalized_columns(columns)
        where = '' if condition == '' else ' WHERE '
        query_text = "SELECT {}{}{}".format(columns, where, condition)

        # Paging state shared by every call to get_r below.  (The original
        # relied on a mutable default argument for this; an explicit
        # closure-captured dict is equivalent and less fragile.)  The offset
        # advances by the number of rows each page returned.
        options = dict(extra_query_options)
        options['offset'] = int(options.get('offset', 0))

        def get_r():
            result = self.http_get('/query',
                                   op="execute_genquery",
                                   query=query_text,
                                   **options)
            json_result = json.loads(result)
            errcode = json_result['irods_response']['error_code']
            if errcode != 0:
                # logger.warn is a deprecated alias; use warning().
                logger.warning('irods error code of [%s] in genquery1', errcode)
            rows = [row_class(*i) for i in json_result['rows']]
            options['offset'] += len(rows)
            return rows

        return _iterable(get_r)

    def __init__(self, username, password, *,
                 host='localhost',
                 port=9000,
                 version='0.9.5'):
        """Authenticate against the iRODS HTTP API and store the bearer token.

        Raises HTTP_operation_error if the authentication request fails.
        """
        self.username = username
        self.password = password
        (self.host, self.port, self.version) = (host, port, version)
        url = self.url_base + '/authenticate'
        r = requests.post(url, auth=(self.username, self.password))
        if not r.ok:
            # The original passed %-style arguments to the exception without
            # ever formatting them, so the URL/status never appeared.
            raise HTTP_operation_error(
                "Failed to connect: url = '%s', status code = %s"
                % (url, r.status_code))
        self.bearer_token = r.text
26 changes: 26 additions & 0 deletions irods/experimental/client/http/iter_or_page.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
# Demonstrate the two consumption styles of Session.genquery1():
#   page - fetch and pretty-print whole pages via next_page()
#   iter - fetch one row at a time by ordinary iteration

if [ $# -gt 0 ]; then
    # ':-' substitutes a default; the original ':=' tries to ASSIGN to $1,
    # which bash rejects for positional parameters when $1 is empty.
    arg=${1:-iter}
else
    echo >&2 "usage: $0 [page|iter]"; exit 1
fi

python -c "
import pprint
from irods.experimental.client.http import *
s = Session('rods','rods')
i = s.genquery1('COLL_NAME', condition='', args=(), extra_query_options=dict(count=3))
import sys
if sys.argv[1] == 'page':
    while True:
        print('---')
        p = i.next_page()
        if not p:
            break
        pprint.pprint(p)
elif sys.argv[1] == 'iter':
    for j in i:
        print('---')
        pprint.pprint(j)
" ${arg}
33 changes: 33 additions & 0 deletions irods/experimental/client/http/iterator_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/usr/bin/env python3
import itertools
import sys
import typing

class too_many_results(Exception):
    """Raised when a query expected to yield exactly one row yields several."""

class too_few_results(Exception):
    """Raised when a query expected to yield exactly one row yields none."""

__all__ = ['first_n', 'one', 'too_many_results', 'too_few_results']

def first_n(iterable: typing.Iterable, n: int):
    """Return (as a list) at most the first *n* items of *iterable*.

    Consumes exactly min(n, available) items from the underlying iterator.
    """
    # zip stops after n pairings without pulling an extra item.
    return [item for _, item in zip(range(n), iterable)]

def one(iterable: typing.Iterable):
    """Return the single item of *iterable*.

    Raises:
        too_many_results: when more than one item is present.
        too_few_results:  when the iterable is empty.
    """
    # Take at most two items: enough to prove uniqueness without
    # exhausting a long iterator.
    sample = list(itertools.islice(iterable, 2))
    if len(sample) > 1:
        raise too_many_results
    if not sample:
        raise too_few_results
    return sample[0]

def test_one():
    """Smoke test: one() returns the sole element of a single-item iterator."""
    # The original referenced an undefined name 'i' (NameError when run);
    # use a fixed one-element range instead.
    assert(
        one(iter(range(10, 11))) == 10
    )

def test_first_n():
    """Smoke test: first_n() truncates to the requested length."""
    # Likewise replace the undefined 'i' with a fixed range.
    assert(
        first_n(iter(range(10, 14)), 2) == [10, 11]
    )

if __name__ == '__main__':
    test_one()
    test_first_n()  # previously defined but never invoked
39 changes: 39 additions & 0 deletions irods/prc_http_client_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Demo of the experimental iRODS HTTP client.

Connects to a live iRODS HTTP API server and exercises the CollManager
getter, genquery1 paging, and the data_object replica query.  Requires a
running server reachable at the hardcoded host; this is not a unit test.
"""
import pprint

from irods.experimental.client.http import *
from irods.experimental.client.http.iterator_functions import *

# NOTE(review): credentials and host are hardcoded for a developer test box.
s = Session('rods','rods',host='prec3431')
c = CollManager(s).get("/tempZone/home/rods")

print ("Got a collection {c.name}, id = {c.id}".format(**locals()))

# Query collections by explicit column list.
result = s.genquery1(['COLL_ID', 'COLL_NAME'], # columns
                     "COLL_NAME like '%'", # condition
                     extra_query_options=dict(count='512'))
print("Result of collection query:\n"
      "---------------------------\n")

# Materialize the row iterator so we can both print it and count it.
result = list(result)
pprint.pprint(result)
print('Length of result was:',len(result))

# For a query of all data objects (note lack of condition argument), list full paths.
for row in s.genquery1('COLL_NAME,DATA_NAME',
                       extra_query_options=dict(count='512')):
    print('path = {COLL_NAME}/{DATA_NAME}'.format(**row._asdict()))

# Fetch the data object requested.
data_path = "/tempZone/home/alice/new_alice.dat"

print ('-- fetch first replica --')

# first_n(..., 1) draws a single row (one replica) from the query.
data_obj = first_n(s.data_object(data_path),n=1)
print(data_obj)

print ('-- fetch all replicas without paging --')

# Use the maximum 32-bit page size so one page holds every replica.
MAX_REPLICAS = 2**31-1
data_obj_replicas = list(s.data_object(data_path, query_options=dict(count=MAX_REPLICAS)))
pprint.pprint(data_obj_replicas)
Loading