"""Shell-style pathname expansion for HDFS, modelled on the stdlib ``glob``."""

import fnmatch
import posixpath
import re


def glob(client, hdfs_path):
    """Return a list of paths matching a pathname pattern.

    The pattern may contain simple shell-style wildcards a la
    fnmatch. However, unlike fnmatch, filenames starting with a
    dot are special cases that are not matched by '*' and '?'
    patterns.

    :param client: Instance of :class:`Client`.
    :param hdfs_path: HDFS path. May contain special characters like '*',
      '?' and '['.
    :return: List of matching paths, spelled with the same prefix as the
      pattern (a relative pattern produces relative results). Empty when
      nothing matches.

    Sample usages:

    .. code-block:: python

      glob(client, './foo/bar/*')
      glob(client, './foo/bar/file[0-9].txt')
      glob(client, './foo/bar/file?.txt')

    """
    return list(iglob(client, hdfs_path))


def iglob(client, hdfs_path):
    """Return an iterator which yields the paths matching a pathname pattern.

    The pattern may contain simple shell-style wildcards a la
    fnmatch. However, unlike fnmatch, filenames starting with a
    dot are special cases that are not matched by '*' and '?'
    patterns.

    Paths that do not exist, and intermediate matches that are not
    directories, contribute no results instead of raising.

    :param client: Instance of :class:`Client`.
    :param hdfs_path: HDFS path. May contain special characters like '*',
      '?' and '['.

    Sample usages:

    .. code-block:: python

      iglob(client, './foo/bar/*')
      iglob(client, './foo/bar/file[0-9].txt')
      iglob(client, './foo/bar/file?.txt')

    """
    dirname, basename = posixpath.split(hdfs_path)
    if not _has_magic(hdfs_path):
        # Literal path: it matches itself if (and only if) it exists.
        if basename:
            if client.status(hdfs_path, strict=False):
                yield hdfs_path
        elif _isdir(client, dirname):
            # Patterns ending with a slash should match only directories.
            yield hdfs_path
        return
    if not dirname:
        # Bare pattern such as '*.txt': expand it in the client's root.
        for name in _glob1(client, None, basename):
            yield name
        return
    # `posixpath.split()` returns its argument unchanged as the dirname when
    # the path consists only of separators; the inequality check keeps the
    # recursion from looping on such an input.
    if dirname != hdfs_path and _has_magic(dirname):
        parents = iglob(client, dirname)
    else:
        parents = [dirname]
    glob_in_dir = _glob1 if _has_magic(basename) else _glob0
    for parent in parents:
        for name in glob_in_dir(client, parent, basename):
            yield posixpath.join(parent, name)


def _isdir(client, hdfs_path):
    """Return True if `hdfs_path` exists and is a directory.

    Uses a non-strict status lookup so that a missing path is reported as
    "not a directory" rather than as an error.
    """
    status = client.status(hdfs_path, strict=False)
    return bool(status) and status['type'] == 'DIRECTORY'


def _glob1(client, dirname, pattern):
    """Return the entries of `dirname` whose names match `pattern`.

    :param client: Instance of :class:`Client`.
    :param dirname: Directory to list; a falsy value means the path that
      `client.resolve('.')` returns.
    :param pattern: Single path component containing wildcards.
    :return: List of matching entry names (not joined with `dirname`). Empty
      if `dirname` is missing or is not a directory.
    """
    if not dirname:
        dirname = client.resolve('.')
        # NOTE(review): bytes patterns are carried over from the stdlib
        # implementation; whether the client accepts bytes paths is not
        # visible here. `bytes(str)` raises on Python 3, so encode instead.
        if isinstance(pattern, bytes) and not isinstance(dirname, bytes):
            dirname = dirname.encode('utf-8')
    # A wildcard parent such as '*' also matches plain files; those cannot be
    # listed, so they simply produce no children.
    if not _isdir(client, dirname):
        return []
    names = client.list(dirname)
    if not _ishidden(pattern):
        names = [name for name in names if not _ishidden(name)]
    # `fnmatchcase` rather than `fnmatch.filter`: the latter normalises case
    # according to the *local* OS, but HDFS names are always case-sensitive.
    return [name for name in names if fnmatch.fnmatchcase(name, pattern)]


def _glob0(client, dirname, basename):
    """Return `[basename]` if the literal entry exists in `dirname`, else `[]`.

    An empty `basename` (pattern ending with a separator, e.g. 'q*x/')
    matches only when `dirname` itself is a directory.
    """
    if not basename:
        return [basename] if _isdir(client, dirname) else []
    if client.status(posixpath.join(dirname, basename), strict=False):
        return [basename]
    return []


# Characters that make a path component a pattern rather than a literal.
magic_check = re.compile('([*?[])')
magic_check_bytes = re.compile(b'([*?[])')


def _has_magic(s):
    """Return True if `s` (str or bytes) contains a wildcard character."""
    checker = magic_check_bytes if isinstance(s, bytes) else magic_check
    return checker.search(s) is not None


def _ishidden(path):
    """Return True if the name starts with a dot; an empty name is not hidden."""
    dot = b'.' if isinstance(path, bytes) else '.'
    return path.startswith(dot)
class TestGlob(_IntegrationTest):
    """Integration tests for `glob` against a small fixture tree."""

    def setup(self):
        super(TestGlob, self).setup()
        self.__build_dirs()

    def __build_dirs(self):
        """Write the fixture files used by the glob patterns below.

        Structure:

          dir_1
            dir_1_1
              file_1_3_1.txt
            dir_1_2
              file_1_3_1.txt
            dir_1_3
              file_1_3_1.txt
              file_1_3_2.txt
              file_1_3_3.txt
            file_1_1.txt
          dir_2
            dir_2_1
              file_2_3_1.txt
            dir_2_2
              file_2_2_1.txt
            dir_2_3
              file_2_3_1.txt
              file_2_3_2.txt
              file_2_3_3.txt
            file_2_1.txt
        """
        files = [
            (('dir_1', 'dir_1_1', 'file_1_3_1.txt'), b'file_1_3_1'),
            (('dir_1', 'dir_1_2', 'file_1_3_1.txt'), b'file_1_3_1'),
            (('dir_1', 'dir_1_3', 'file_1_3_1.txt'), b'file_1_3_1'),
            (('dir_1', 'dir_1_3', 'file_1_3_2.txt'), b'file_1_3_2'),
            (('dir_1', 'dir_1_3', 'file_1_3_3.txt'), b'file_1_3_3'),
            (('dir_1', 'file_1_1.txt'), b'file_1_1'),
            (('dir_2', 'dir_2_1', 'file_2_3_1.txt'), b'file_2_3_1'),
            (('dir_2', 'dir_2_2', 'file_2_2_1.txt'), b'file_2_2_1'),
            (('dir_2', 'dir_2_3', 'file_2_3_1.txt'), b'file_2_3_1'),
            (('dir_2', 'dir_2_3', 'file_2_3_2.txt'), b'file_2_3_2'),
            (('dir_2', 'dir_2_3', 'file_2_3_3.txt'), b'file_2_3_3'),
            (('dir_2', 'file_2_1.txt'), b'file_2_1'),
        ]
        for parts, contents in files:
            self._write(posixpath.join(*parts), contents)

    def test(self):
        values = [
            ('./dir_1/dir_1_3/*', [
                './dir_1/dir_1_3/file_1_3_1.txt',
                './dir_1/dir_1_3/file_1_3_2.txt',
                './dir_1/dir_1_3/file_1_3_3.txt',
            ]),
            ('./dir_2/dir_2_3/file_2_3_?.txt', [
                './dir_2/dir_2_3/file_2_3_1.txt',
                './dir_2/dir_2_3/file_2_3_2.txt',
                './dir_2/dir_2_3/file_2_3_3.txt',
            ]),
            ('*/*.txt', [
                'dir_1/file_1_1.txt',
                'dir_2/file_2_1.txt',
            ]),
            ('./dir_[1-2]/file_[1-2]_1.txt', [
                './dir_1/file_1_1.txt',
                './dir_2/file_2_1.txt',
            ]),
            ('./dir_*/dir_*/file_[1-2]_3_2.txt', [
                './dir_1/dir_1_3/file_1_3_2.txt',
                './dir_2/dir_2_3/file_2_3_2.txt',
            ]),
            ('./dir_[3-4]/file_[1-2]_1.txt', []),
            ('./dir_*/dir_*/file_[3-4]_3_2.txt', []),
        ]
        for pattern, expected in values:
            actual = glob(self.client, pattern)
            # Compare order-insensitively so the check does not depend on the
            # order in which the server lists directory entries.
            eq_(
                sorted(expected), sorted(actual),
                'Unexpected result for pattern ' + pattern
            )