Skip to content

Commit b5c7689

Browse files
committed
fix #237: all encodings default to utf8
* Testing revealed a few edge cases in config file reading and CSV file reading; these are also fixed.
* Updated the release notes.
1 parent 3604bb5 commit b5c7689

File tree

7 files changed

+21
-14
lines changed

7 files changed

+21
-14
lines changed

RELEASE_NOTES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ These notes apply to v2.2rc1 of 2017-06-18.
1212

1313
[#236](https://github.com/adobe-apiplatform/user-sync.py/issues/236): Directory users can now be pushed directly to Adobe, rather than synchronized with a fetch of Adobe users. A new command-line argument `--strategy push` (as opposed to the default `--strategy sync`) controls this.
1414

15+
[#237](https://github.com/adobe-apiplatform/user-sync.py/issues/237): The default encoding for all inputs (config files, CSV files, LDAP attribute values) is now assumed to be `utf8` rather than ASCII. This is a backward-compatible change that makes it unnecessary (but still allowed) to specify `utf8` explicitly.
16+
1517
## Bug Fixes
1618

1719
This release contains bug fixes for:

examples/config files - basic/3 connector-ldap.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,9 @@ group_member_filter_format: "(memberOf={group_dn})"
8383
# string_encoding specifies the Unicode string encoding used by the directory.
8484
# All values retrieved from the directory are converted to Unicode before being
8585
# sent to or compared with values on the Adobe side, to avoid encoding issues.
86-
# The value must be a Python codec name or alias, such as 'latin1' or 'utf-8.
86+
# The value must be a Python codec name or alias, such as 'latin1' or 'big5'.
8787
# See https://docs.python.org/2/library/codecs.html#standard-encodings for details.
88-
#string_encoding: utf-8
88+
#string_encoding: utf8
8989

9090
# (optional) user_identity_type_format (no default)
9191
# user_identity_type_format specifies how to construct a user's desired identity

user_sync/app.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,10 @@ def process_args():
102102
"users by also including --adobe-only-user-action and one of its arguments",
103103
metavar='input_path', dest='stray_list_input_path')
104104
parser.add_argument('--config-file-encoding',
105-
help="config files are expected to contain only ASCII characters; if you "
106-
"use an extended character set (e.g., to specify group names), then "
107-
"specify the encoding of your configuration files with this argument. "
105+
help="configuration files are expected to be utf8-encoded (which includes ascii); if you "
106+
"use a different character set, then specify it with this argument. "
108107
"All encoding names understood by Python are allowed.",
109-
dest='encoding_name', default='ascii')
108+
dest='encoding_name', default='utf8')
110109
parser.add_argument('--strategy',
111110
help="whether to fetch and sync the Adobe directory against the customer directory "
112111
"or just to push each customer user to the Adobe side. Default is to fetch and sync.",

user_sync/config.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def __init__(self, caller_options):
4646
self.options = options = {
4747
# these are in alphabetical order! Always add new ones that way!
4848
'delete_strays': False,
49-
'config_file_encoding': 'ascii',
49+
'config_file_encoding': 'utf8',
5050
'directory_connector_module_name': None,
5151
'directory_connector_overridden_options': None,
5252
'directory_group_filter': None,
@@ -616,8 +616,8 @@ class ConfigFileLoader:
616616
Loads config files and does pathname expansion on settings that refer to files or directories
617617
"""
618618
# config files can contain Unicode characters, so an encoding for them
619-
# can be specified as a command line argument. This defaults to ascii.
620-
config_encoding = 'ascii'
619+
# can be specified as a command line argument. This defaults to utf8.
620+
config_encoding = 'utf8'
621621

622622
# key_paths in the root configuration file that should have filename values
623623
# mapped to their value options. See load_from_yaml for the option meanings.
@@ -724,6 +724,12 @@ def load_from_yaml(cls, filename, path_keys):
724724
raise AssertionException("Error parsing configuration file '%s': %s" % (cls.filepath, e))
725725

726726
# process the content of the dict
727+
if yml is None:
728+
# empty YML files are parsed as None
729+
yml = {}
730+
elif not isinstance(yml, dict):
731+
# malformed YML files produce a non-dictionary
732+
raise AssertionException("Configuration file '%s' does not contain settings" % cls.filepath)
727733
for path_key, options in six.iteritems(path_keys):
728734
cls.key_path = path_key
729735
keys = path_key.split('/')

user_sync/connector/directory_csv.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def __init__(self, caller_options):
6060
caller_config = user_sync.config.DictConfig('%s configuration' % self.name, caller_options)
6161
builder = user_sync.config.OptionsBuilder(caller_config)
6262
builder.set_string_value('delimiter', None)
63-
builder.set_string_value('string_encoding', 'utf-8')
63+
builder.set_string_value('string_encoding', 'utf8')
6464
builder.set_string_value('first_name_column_name', 'firstname')
6565
builder.set_string_value('last_name_column_name', 'lastname')
6666
builder.set_string_value('email_column_name', 'email')

user_sync/connector/directory_ldap.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ def iter_search_result(self, base_dn, scope, filter_string, attributes):
323323

324324

325325
class LDAPValueFormatter(object):
326-
encoding = 'utf-8'
326+
encoding = 'utf8'
327327

328328
def __init__(self, string_format):
329329
"""

user_sync/helper.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def guess_delimiter_from_filename(filename):
8686
return '\t'
8787

8888
@classmethod
89-
def read_csv_rows(cls, file_path, recognized_column_names=None, logger=None, encoding=None, delimiter=None):
89+
def read_csv_rows(cls, file_path, recognized_column_names=None, logger=None, encoding='utf8', delimiter=None):
9090
"""
9191
:type file_path: str
9292
:type recognized_column_names: list(str)
@@ -115,14 +115,14 @@ def read_csv_rows(cls, file_path, recognized_column_names=None, logger=None, enc
115115
# in py2, we need to decode both the column names *and* the values
116116
newrow = {}
117117
for key, val in six.iteritems(row):
118-
newrow[key.decode(encoding, 'strict')] = val.decode(encoding, 'strict')
118+
newrow[key.decode(encoding, 'strict')] = val.decode(encoding, 'strict') if val else None
119119
row = newrow
120120
yield row
121121
except UnicodeError as e:
122122
raise AssertionException("Encoding error in file '%s': %s" % (file_path, e))
123123

124124
@classmethod
125-
def write_csv_rows(cls, file_path, field_names, rows, encoding='ascii', delimiter=None):
125+
def write_csv_rows(cls, file_path, field_names, rows, encoding='utf8', delimiter=None):
126126
"""
127127
:type file_path: str
128128
:type field_names: list(str)

0 commit comments

Comments
 (0)