From 91530122ce9d2a9d0d7467ea4c3b6e1d898c3d2d Mon Sep 17 00:00:00 2001 From: alex Date: Mon, 28 Sep 2020 22:31:07 -0500 Subject: [PATCH 1/8] fix hadoop download URL in hadoop.sh --- scripts/hadoop.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/hadoop.sh b/scripts/hadoop.sh index 2939639..d1ef068 100755 --- a/scripts/hadoop.sh +++ b/scripts/hadoop.sh @@ -38,7 +38,7 @@ usage() { hadoop-download() { local hadoop='hadoop-2.9.2' cd "$(mktemp -d 2>/dev/null || mktemp -d -t 'hadoop')" - curl -O "https://www-us.apache.org/dist/hadoop/common/${hadoop}/${hadoop}.tar.gz" + curl -O "https://downloads.apache.org/hadoop/common/${hadoop}/${hadoop}.tar.gz" tar -xzf "${hadoop}.tar.gz" echo "$(pwd)/${hadoop}" } From 1904c509c58d3d5f1e9c26c8f768fe0b7236e19c Mon Sep 17 00:00:00 2001 From: alex Date: Mon, 28 Sep 2020 22:33:07 -0500 Subject: [PATCH 2/8] glob implementation. --- hdfs/glob.py | 119 ++++++++++++++++++++++++++++++++++++++++++++++ test/test_glob.py | 85 +++++++++++++++++++++++++++++++++ 2 files changed, 204 insertions(+) create mode 100644 hdfs/glob.py create mode 100644 test/test_glob.py diff --git a/hdfs/glob.py b/hdfs/glob.py new file mode 100644 index 0000000..3dbe74c --- /dev/null +++ b/hdfs/glob.py @@ -0,0 +1,119 @@ +import fnmatch +import re + +from hdfs import Client + +import posixpath + + +def glob(client, hdfs_path): + """Return a list of paths matching a pathname pattern. + + The pattern may contain simple shell-style wildcards a la + fnmatch. However, unlike fnmatch, filenames starting with a + dot are special cases that are not matched by '*' and '?' + patterns. + + :param client: Instance of :class:`Client`. + :param hdfs_path: HDFS path. May contain special characters like '*', '?' and '['. + + Sample usages: + + .. code-block:: python + + glob(client, './foo/bar/*') + glob(client, './foo/bar/file[0-9].txt') + glob(client, './foo/bar/file?.txt') + + """ + return list(iglob(client, hdfs_path)) + + +def iglob(client, hdfs_path): + """Return an iterator which yields the paths matching a pathname pattern. + + The pattern may contain simple shell-style wildcards a la + fnmatch. However, unlike fnmatch, filenames starting with a + dot are special cases that are not matched by '*' and '?' + patterns. + + :param client: Instance of :class:`Client`. + :param hdfs_path: HDFS path. May contain special characters like '*', '?' and '['. + + Sample usages: + + .. code-block:: python + + iglob(client, './foo/bar/*') + iglob(client, './foo/bar/file[0-9].txt') + iglob(client, './foo/bar/file?.txt') + + """ + dirname, basename = posixpath.split(hdfs_path) + if not has_magic(hdfs_path): + if basename: + if client.status(hdfs_path, strict=False): + yield hdfs_path + else: + # Patterns ending with a slash should match only directories + if client.status(dirname)['type'] == 'DIRECTORY': + yield hdfs_path + return + if not dirname: + yield from glob1(client, None, basename) + return + # `os.path.split()` returns the argument itself as a dirname if it is a + # drive or UNC path. Prevent an infinite recursion if a drive or UNC path + # contains magic characters (i.e. r'\\?\C:'). + if dirname != hdfs_path and has_magic(dirname): + dirs = iglob(client, dirname) + else: + dirs = [dirname] + if has_magic(basename): + glob_in_dir = glob1 + else: + glob_in_dir = glob0 + for dirname in dirs: + for name in glob_in_dir(client, dirname, basename): + yield posixpath.join(dirname, name) + + +def glob1(client, dirname, pattern): + if not dirname: + if isinstance(pattern, bytes): + dirname = bytes(client.resolve('.'), 'ASCII') + else: + dirname = client.resolve('.') + names = client.list(dirname) + if not _ishidden(pattern): + names = [x for x in names if not _ishidden(x)] + return fnmatch.filter(names, pattern) + + +def glob0(client, dirname, basename): + if not basename: + # `os.path.split()` returns an empty basename for paths ending with a + # directory separator. 'q*x/' should match only directories. + if client.status(dirname)['type'] == 'DIRECTORY': + return [basename] + else: + if client.status(posixpath.join(dirname, basename), strict=False): + return [basename] + return [] + + +magic_check = re.compile('([*?[])') +magic_check_bytes = re.compile(b'([*?[])') + + +def has_magic(s): + if isinstance(s, bytes): + match = magic_check_bytes.search(s) + else: + match = magic_check.search(s) + return match is not None + + +def _ishidden(path): + return path[0] in ('.', b'.'[0]) + diff --git a/test/test_glob.py b/test/test_glob.py new file mode 100644 index 0000000..d10960f --- /dev/null +++ b/test/test_glob.py @@ -0,0 +1,85 @@ +import posixpath + +from nose.tools import eq_ + +from hdfs.glob import glob +from util import _IntegrationTest + + +class TestGlob(_IntegrationTest): + + def setup(self): + super().setup() + self.__build_dirs() + + def __build_dirs(self): + """ + Structure: + + dir_1 + dir_1_1 + file_1_3_1.txt + dir_1_2 + file_1_3_1.txt + dir_1_3 + file_1_3_1.txt + file_1_3_2.txt + file_1_3_3.txt + file_1_1.txt + dir_2 + dir_2_1 + file_2_3_1.txt + dir_2_2 + file_2_3_1.txt + dir_2_3 + file_2_3_1.txt + file_2_3_2.txt + file_2_3_3.txt + file_2_1.txt + """ + self._write(posixpath.join('dir_1', 'dir_1_1', 'file_1_3_1.txt'), b'file_1_3_1') + self._write(posixpath.join('dir_1', 'dir_1_2', 'file_1_3_1.txt'), b'file_1_3_1') + self._write(posixpath.join('dir_1', 'dir_1_3', 'file_1_3_1.txt'), b'file_1_3_1') + self._write(posixpath.join('dir_1', 'dir_1_3', 'file_1_3_2.txt'), b'file_1_3_2') + self._write(posixpath.join('dir_1', 'dir_1_3', 'file_1_3_3.txt'), b'file_1_3_3') + self._write(posixpath.join('dir_1', 'file_1_1.txt'), b'file_1_1') + self._write(posixpath.join('dir_2', 'dir_2_1', 'file_2_3_1.txt'), b'file_2_3_1') + self._write(posixpath.join('dir_2', 'dir_2_2', 'file_2_2_1.txt'), b'file_2_2_1') + self._write(posixpath.join('dir_2', 'dir_2_3', 'file_2_3_1.txt'), b'file_2_3_1') + self._write(posixpath.join('dir_2', 'dir_2_3', 'file_2_3_2.txt'), b'file_2_3_2') + self._write(posixpath.join('dir_2', 'dir_2_3', 'file_2_3_3.txt'), b'file_2_3_3') + self._write(posixpath.join('dir_2', 'file_2_1.txt'), b'file_2_1') + + def test(self): + values = [ + ('./dir_1/dir_1_3/*', [ + './dir_1/dir_1_3/file_1_3_1.txt', + './dir_1/dir_1_3/file_1_3_2.txt', + './dir_1/dir_1_3/file_1_3_3.txt', + ]), + ('./dir_2/dir_2_3/file_2_3_?.txt', [ + './dir_2/dir_2_3/file_2_3_1.txt', + './dir_2/dir_2_3/file_2_3_2.txt', + './dir_2/dir_2_3/file_2_3_3.txt', + ]), + ('*/*.txt', [ + 'dir_1/file_1_1.txt', + 'dir_2/file_2_1.txt', + ]), + ('./dir_[1-2]/file_[1-2]_1.txt', [ + './dir_1/file_1_1.txt', + './dir_2/file_2_1.txt', + ]), + ('./dir_*/dir_*/file_[1-2]_3_2.txt', [ + './dir_1/dir_1_3/file_1_3_2.txt', + './dir_2/dir_2_3/file_2_3_2.txt', + ]), + ('./dir_[3-4]/file_[1-2]_1.txt', []), + ('./dir_*/dir_*/file_[3-4]_3_2.txt', []), + ] + for pattern, expected in values: + actual = glob(self.client, pattern) + eq_(expected, actual, 'Unexpected result for pattern ' + pattern) + + + From 48d2d2b5821adfa27ee404d8a194faff58fa162b Mon Sep 17 00:00:00 2001 From: alex Date: Tue, 29 Sep 2020 09:01:37 -0500 Subject: [PATCH 3/8] get rid of "yield from" which is not supported in Python 2.7 --- hdfs/glob.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hdfs/glob.py b/hdfs/glob.py index 3dbe74c..5e4db20 100644 --- a/hdfs/glob.py +++ b/hdfs/glob.py @@ -60,7 +60,8 @@ def iglob(client, hdfs_path): yield hdfs_path return if not dirname: - yield from glob1(client, None, basename) + for p in glob1(client, None, basename): + yield p return # `os.path.split()` returns the argument itself as a dirname if it is a # drive or UNC path. Prevent an infinite recursion if a drive or UNC path From eb6f97d4e71d9db4bf660a7d7ddcb86f5f797597 Mon Sep 17 00:00:00 2001 From: alex Date: Tue, 29 Sep 2020 09:19:23 -0500 Subject: [PATCH 4/8] fix backward compatibility issue with Python 2.7 in super() --- test/test_glob.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_glob.py b/test/test_glob.py index d10960f..bc8afe5 100644 --- a/test/test_glob.py +++ b/test/test_glob.py @@ -9,7 +9,7 @@ class TestGlob(_IntegrationTest): def setup(self): - super().setup() + super(TestGlob, self).setup() self.__build_dirs() def __build_dirs(self): From b008c433615c4e8d22769dbc16374bda891f9464 Mon Sep 17 00:00:00 2001 From: alex Date: Tue, 29 Sep 2020 09:42:28 -0500 Subject: [PATCH 5/8] fix backward compatibility issue with Python 2.7 in bytes() --- hdfs/glob.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hdfs/glob.py b/hdfs/glob.py index 5e4db20..448269e 100644 --- a/hdfs/glob.py +++ b/hdfs/glob.py @@ -82,7 +82,7 @@ def iglob(client, hdfs_path): def glob1(client, dirname, pattern): if not dirname: if isinstance(pattern, bytes): - dirname = bytes(client.resolve('.'), 'ASCII') + dirname = bytes(client.resolve('.')) else: dirname = client.resolve('.') names = client.list(dirname) From 109e3beec8a610c259bc86e78d39835fe6a9ad0f Mon Sep 17 00:00:00 2001 From: alex Date: Tue, 16 Feb 2021 17:25:17 -0600 Subject: [PATCH 6/8] re-organize imports and indents in docstring. --- hdfs/__init__.py | 1 + hdfs/glob.py | 23 ++++++++++------------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/hdfs/__init__.py b/hdfs/__init__.py index 88af8cc..b06e844 100644 --- a/hdfs/__init__.py +++ b/hdfs/__init__.py @@ -6,6 +6,7 @@ from .client import Client, InsecureClient, TokenClient from .config import Config, NullHandler from .util import HdfsError +from .glob import glob import logging as lg diff --git a/hdfs/glob.py b/hdfs/glob.py index 448269e..72bec41 100644 --- a/hdfs/glob.py +++ b/hdfs/glob.py @@ -1,8 +1,5 @@ import fnmatch import re - -from hdfs import Client - import posixpath @@ -17,13 +14,13 @@ def glob(client, hdfs_path): :param client: Instance of :class:`Client`. :param hdfs_path: HDFS path. May contain special characters like '*', '?' and '['. - Sample usages: + Sample usages: - .. code-block:: python + .. code-block:: python - glob(client, './foo/bar/*') - glob(client, './foo/bar/file[0-9].txt') - glob(client, './foo/bar/file?.txt') + glob(client, './foo/bar/*') + glob(client, './foo/bar/file[0-9].txt') + glob(client, './foo/bar/file?.txt') """ return list(iglob(client, hdfs_path)) @@ -40,13 +37,13 @@ def iglob(client, hdfs_path): :param client: Instance of :class:`Client`. :param hdfs_path: HDFS path. May contain special characters like '*', '?' and '['. - Sample usages: + Sample usages: - .. code-block:: python + .. code-block:: python - iglob(client, './foo/bar/*') - iglob(client, './foo/bar/file[0-9].txt') - iglob(client, './foo/bar/file?.txt') + iglob(client, './foo/bar/*') + iglob(client, './foo/bar/file[0-9].txt') + iglob(client, './foo/bar/file?.txt') """ dirname, basename = posixpath.split(hdfs_path) From 00b2a03f9988620ec178876b818dda487f0f6bf2 Mon Sep 17 00:00:00 2001 From: alex Date: Tue, 16 Feb 2021 17:37:57 -0600 Subject: [PATCH 7/8] underscore all non-public functions. --- hdfs/glob.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/hdfs/glob.py b/hdfs/glob.py index 72bec41..766f9cb 100644 --- a/hdfs/glob.py +++ b/hdfs/glob.py @@ -47,7 +47,7 @@ def iglob(client, hdfs_path): """ dirname, basename = posixpath.split(hdfs_path) - if not has_magic(hdfs_path): + if not _has_magic(hdfs_path): if basename: if client.status(hdfs_path, strict=False): yield hdfs_path @@ -57,26 +57,26 @@ def iglob(client, hdfs_path): yield hdfs_path return if not dirname: - for p in glob1(client, None, basename): + for p in _glob1(client, None, basename): yield p return # `os.path.split()` returns the argument itself as a dirname if it is a # drive or UNC path. Prevent an infinite recursion if a drive or UNC path # contains magic characters (i.e. r'\\?\C:'). - if dirname != hdfs_path and has_magic(dirname): + if dirname != hdfs_path and _has_magic(dirname): dirs = iglob(client, dirname) else: dirs = [dirname] - if has_magic(basename): - glob_in_dir = glob1 + if _has_magic(basename): + glob_in_dir = _glob1 else: - glob_in_dir = glob0 + glob_in_dir = _glob0 for dirname in dirs: for name in glob_in_dir(client, dirname, basename): yield posixpath.join(dirname, name) -def glob1(client, dirname, pattern): +def _glob1(client, dirname, pattern): if not dirname: if isinstance(pattern, bytes): dirname = bytes(client.resolve('.')) @@ -88,7 +88,7 @@ def glob1(client, dirname, pattern): return fnmatch.filter(names, pattern) -def glob0(client, dirname, basename): +def _glob0(client, dirname, basename): if not basename: # `os.path.split()` returns an empty basename for paths ending with a # directory separator. 'q*x/' should match only directories. @@ -104,7 +104,7 @@ def glob0(client, dirname, basename): magic_check_bytes = re.compile(b'([*?[])') -def has_magic(s): +def _has_magic(s): if isinstance(s, bytes): match = magic_check_bytes.search(s) else: From e0966a2453e8c26016ad5350c4d430a7b2895fbf Mon Sep 17 00:00:00 2001 From: alex Date: Tue, 16 Feb 2021 18:00:15 -0600 Subject: [PATCH 8/8] add import of iglob to __init__.py --- hdfs/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hdfs/__init__.py b/hdfs/__init__.py index b06e844..970722e 100644 --- a/hdfs/__init__.py +++ b/hdfs/__init__.py @@ -6,7 +6,7 @@ from .client import Client, InsecureClient, TokenClient from .config import Config, NullHandler from .util import HdfsError -from .glob import glob +from .glob import glob, iglob import logging as lg