Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ TESTS=testcases/largefilesupport.sh \
testcases/verify_deterministic_operation.sh \
testcases/checksum_options.sh \
testcases/md5collisions.sh \
testcases/sha1collisions.sh
testcases/sha1collisions.sh \
testcases/hardlink_groups.sh

AUXFILES=testcases/common_funcs.sh \
testcases/md5collisions/letter_of_rec.ps \
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ Rdfind uses the following algorithm. If N is the number of files to search throu
2. For each argument, list the directory contents recursively and assign it to the file list. Assign a directory depth number, starting at 0 for every argument.
3. If the input argument is a file, add it to the file list.
4. Loop over the list, and find out the sizes of all files.
5. If flag -removeidentinode true: Remove items from the list which already are added, based on the combination of inode and device number. A group of files that are hardlinked to the same file are collapsed to one entry. Also see the comment on hardlinks under ”caveats below”!
5. If flag -removeidentinode true: Remove items from the list which already are added, based on the combination of inode and device number. A group of files that are hardlinked to the same file are collapsed to one entry. If flag -rememberidentinode true the removed files are remembered and included in the final result. Also see the comment on hardlinks under ”caveats below”!
6. Sort files on size. Remove files from the list, which have unique sizes.
7. Sort on device and inode(speeds up file reading). Read a few bytes from the beginning of each file (first bytes).
8. Remove files from list that have the same size but different first bytes.
Expand Down
158 changes: 110 additions & 48 deletions Rdutil.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,15 @@ Rdutil::printtofile(const std::string& filename) const
output << "# Automatically generated\n";
output << "# duptype id depth size device inode priority name\n";

std::vector<Fileinfo>::iterator it;
for (it = m_list.begin(); it != m_list.end(); ++it) {
output << Fileinfo::getduptypestring(*it) << " " << it->getidentity() << " "
<< it->depth() << " " << it->size() << " " << it->device() << " "
<< it->inode() << " " << it->get_cmdline_index() << " " << it->name()
<< '\n';
}
process_result(
[&output](Fileinfo& it) {
output << Fileinfo::getduptypestring(it) << " " << it.getidentity() << " "
<< it.depth() << " " << it.size() << " " << it.device() << " "
<< it.inode() << " " << it.get_cmdline_index() << " " << it.name()
<< '\n';
}
);

output << "# end of file\n";
f1.close();
return 0;
Expand All @@ -61,44 +63,42 @@ Rdutil::printtofile(const std::string& filename) const
// returns how many times the function was invoked.
template<typename Function>
std::size_t
applyactiononfile(std::vector<Fileinfo>& m_list, Function f)
Rdutil::applyactiononfile(Function f) const
{

const auto first = m_list.begin();
const auto last = m_list.end();
auto original = last;

Fileinfo* original = NULL;
std::size_t ntimesapplied = 0;

// loop over files
for (auto it = first; it != last; ++it) {
switch (it->getduptype()) {
case Fileinfo::duptype::DUPTYPE_FIRST_OCCURRENCE: {
original = it;
assert(original->getidentity() >= 0 &&
"original file should have positive identity");
} break;

case Fileinfo::duptype::DUPTYPE_OUTSIDE_TREE:
// intentional fallthrough
case Fileinfo::duptype::DUPTYPE_WITHIN_SAME_TREE: {
assert(original != last);
// double check that "it" shall be ~linked to "src"
assert(it->getidentity() == -original->getidentity() &&
"it must be connected to src");
// everything is in order. we may now hardlink/symlink/remove it.
if (f(*it, *original)) {
RDDEBUG(__FILE__ ": Failed to apply function f on it.\n");
} else {
++ntimesapplied;
}
} break;
process_result(
[f, original, &ntimesapplied](Fileinfo& it) mutable {
switch (it.getduptype()) {
case Fileinfo::duptype::DUPTYPE_FIRST_OCCURRENCE: {
original = &it;
assert(original->getidentity() >= 0 &&
"original file should have positive identity");
} break;

case Fileinfo::duptype::DUPTYPE_OUTSIDE_TREE:
// intentional fallthrough
case Fileinfo::duptype::DUPTYPE_WITHIN_SAME_TREE: {
assert(original != NULL);
// double check that "it" shall be ~linked to "src"
assert(it.getidentity() == -original->getidentity() &&
"it must be connected to src");
// everything is in order. we may now hardlink/symlink/remove it.
if (f(it, *original)) {
RDDEBUG(__FILE__ ": Failed to apply function f on it.\n");
} else {
++ntimesapplied;
}
} break;

default:
assert("file with bad duptype at this stage. Programming error!" !=
nullptr);
default:
assert("file with bad duptype at this stage. Programming error!" !=
nullptr);
}
}
}
);

return ntimesapplied;
}

Expand Down Expand Up @@ -140,11 +140,11 @@ Rdutil::deleteduplicates(bool dryrun) const
if (dryrun) {
const bool outputBname = false;
dryrun_helper<outputBname> obj("delete ");
auto ret = applyactiononfile(m_list, obj);
auto ret = applyactiononfile(obj);
std::cout.flush();
return ret;
} else {
return applyactiononfile(m_list, &Fileinfo::static_deletefile);
return applyactiononfile(&Fileinfo::static_deletefile);
}
}

Expand All @@ -154,11 +154,11 @@ Rdutil::makesymlinks(bool dryrun) const
if (dryrun) {
const bool outputBname = true;
dryrun_helper<outputBname> obj("symlink ", " to ");
auto ret = applyactiononfile(m_list, obj);
auto ret = applyactiononfile(obj);
std::cout.flush();
return ret;
} else {
return applyactiononfile(m_list, &Fileinfo::static_makesymlink);
return applyactiononfile(&Fileinfo::static_makesymlink);
}
}

Expand All @@ -168,11 +168,11 @@ Rdutil::makehardlinks(bool dryrun) const
if (dryrun) {
const bool outputBname = true;
dryrun_helper<outputBname> obj("hardlink ", " to ");
const auto ret = applyactiononfile(m_list, obj);
const auto ret = applyactiononfile(obj);
std::cout.flush();
return ret;
} else
return applyactiononfile(m_list, &Fileinfo::static_makehardlink);
return applyactiononfile(&Fileinfo::static_makehardlink);
}

// mark files with a unique number
Expand Down Expand Up @@ -298,7 +298,7 @@ Rdutil::sort_on_depth_and_name(std::size_t index_of_first)
}

std::size_t
Rdutil::removeIdenticalInodes()
Rdutil::removeIdenticalInodes(bool rememberIdenticalInodes)
{
// sort list on device and inode.
auto cmp = cmpDeviceInode;
Expand All @@ -315,6 +315,11 @@ Rdutil::removeIdenticalInodes()
best->setdeleteflag(false);
std::for_each(best + 1, last, [](Fileinfo& f) { f.setdeleteflag(true); });
});

if (rememberIdenticalInodes) {
move_deletes_to_duplist();
}

return cleanup();
}

Expand Down Expand Up @@ -377,14 +382,15 @@ Rdutil::markduplicates()
{
const auto cmp = cmpSizeThenBuffer;
assert(std::is_sorted(m_list.begin(), m_list.end(), cmp));
assert(std::is_sorted(m_identlist.begin(), m_identlist.end(), cmpDeviceInode));

// loop over ranges of adjacent elements
using Iterator = decltype(m_list.begin());
apply_on_range(
m_list.begin(),
m_list.end(),
cmp,
[](const Iterator first, const Iterator last) {
[this](const Iterator first, const Iterator last) {
// size and buffer are equal in [first,last) - all are duplicates!
assert(std::distance(first, last) >= 2);

Expand Down Expand Up @@ -413,7 +419,63 @@ Rdutil::markduplicates()
std::for_each(first + 1, last, marker);
assert(first->getduptype() ==
Fileinfo::duptype::DUPTYPE_FIRST_OCCURRENCE);

if (m_identlist.size() > 0) {
auto np = m_identlist.end();
m_identindex.push_back(np);
for (auto it = first+1; it < last; it++) {
auto cmp = cmpDeviceInode;
auto bound = std::lower_bound(m_identlist.begin(), m_identlist.end(), *it, cmp);
if (bound != m_identlist.end() && !cmp(*it, *bound)) {
assert(cmp(*it, *bound) == cmp(*bound, *it));
m_identindex.push_back(bound);
} else {
m_identindex.push_back(np);
}
auto range = find_identical_inodes(it);
std::for_each(range.first, range.second, marker);
}
}
});
assert(m_identlist.size() == 0 || m_identindex.size() == m_list.size());
}

// Returns the half-open range [first, last) of entries in m_identlist whose
// device/inode pair matches the remembered entry indexed for listpos.
// Precondition: m_identindex has been filled (see markduplicates()) so that
// m_identindex[i] is either an iterator into the sorted m_identlist or
// m_identlist.end() when no remembered entry matched m_list[i].
// When no match was recorded, first == m_identlist.end() and the returned
// range is empty (the scan loop below terminates before dereferencing).
std::pair<Rdutil::FileIter, Rdutil::FileIter>
Rdutil::find_identical_inodes(Rdutil::FileIter listpos) const
{
assert(m_identindex.size() != 0);
// offset of listpos in the main list; m_identindex is parallel to m_list
auto index = listpos - m_list.begin();
auto first = m_identindex[index];
auto last = first;
// m_identlist is sorted with cmpDeviceInode, so all entries equal to *first
// are adjacent; advance last while the comparator reports "not greater",
// i.e. while the device/inode still compares equal to *first.
for (; last < m_identlist.end() && !cmpDeviceInode(*first, *last); last++) {}
return std::pair<Rdutil::FileIter, Rdutil::FileIter>(first, last);
}

// Applies f to every file in the final result: each entry of the main list
// and, when identical-inode files were remembered (m_identlist non-empty),
// the remembered entries sharing that file's device/inode as well.
// @param f callable taking Fileinfo&; invoked once per reported file.
template<typename Function>
void
Rdutil::process_result(Function f) const
{
// Only consult the remembered-inode index when something was remembered;
// find_identical_inodes() asserts that the index is non-empty.
bool with_remembered_nodes = m_identlist.size() > 0;
for (auto it = m_list.begin(); it < m_list.end(); ++it) {
f(*it);
if (with_remembered_nodes) {
// also visit the remembered hardlink-group members for this entry
auto range = find_identical_inodes(it);
for (auto range_it = range.first; range_it < range.second; ++range_it) {
f(*range_it);
}
}
}
}

// Copies every entry currently flagged for deletion into m_identlist and
// sorts that list on device+inode so later std::lower_bound lookups work.
// NOTE(review): despite the name, entries are only copied here — the actual
// removal from m_list is performed separately by the caller (cleanup() in
// removeIdenticalInodes()).
void
Rdutil::move_deletes_to_duplist()
{
for (auto it = m_list.begin(); it < m_list.end(); it++) {
if(it->deleteflag()) {
m_identlist.push_back(*it);
}
}
// sorted order is required by the binary searches over m_identlist
std::sort(m_identlist.begin(), m_identlist.end(), cmpDeviceInode);
}

std::size_t
Expand Down
16 changes: 15 additions & 1 deletion Rdutil.hh
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public:
* rank.
* @return number of elements removed
*/
std::size_t removeIdenticalInodes();
std::size_t removeIdenticalInodes(bool rememberIdenticalInodes);

/**
* remove files with unique size from the list.
Expand Down Expand Up @@ -121,6 +121,20 @@ public:

private:
std::vector<Fileinfo>& m_list;

std::vector<Fileinfo> m_identlist;

typedef std::vector<Fileinfo>::iterator FileIter;

std::vector<FileIter> m_identindex;

void move_deletes_to_duplist();

std::pair<FileIter,FileIter> find_identical_inodes(const FileIter listpos) const;

template<typename Function> std::size_t applyactiononfile(Function f) const;

template<typename Function> void process_result(Function f) const;
};

#endif
7 changes: 6 additions & 1 deletion rdfind.1
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,12 @@ Follow symlinks. Default is false.
.TP
.BR \-removeidentinode " " \fItrue\fR|\fIfalse\fR
Removes items found which have identical inode and device ID. Default
is true.
is true. Consider using -rememberidentinode true instead of -removeidentinode false.
.TP
.BR \-rememberidentinode " " \fItrue\fR|\fIfalse\fR
Removes, but remembers, items found which have identical inode and device ID, and
includes them again in the final result. This runs faster and reports more accurate
statistics than -removeidentinode false. Implies -removeidentinode true. Default is false.
.TP
.BR \-checksum " " \fImd5\fR|\fIsha1\fR|\fIsha256\fR
What type of checksum to be used: md5, sha1 or sha256. The default is
Expand Down
11 changes: 9 additions & 2 deletions rdfind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ usage()
<< " -followsymlinks true |(false) follow symlinks\n"
<< " -removeidentinode (true)| false ignore files with nonunique "
"device and inode\n"
<< " -rememberidentinode true|(false) ignore files with nonunique device "
"and inode but remember them and include them in the result later. "
"Implies -removeidentinode true\n"
<< " -checksum md5 |(sha1)| sha256\n"
<< " checksum type\n"
<< " -deterministic (true)| false makes results independent of order\n"
Expand Down Expand Up @@ -102,6 +105,7 @@ struct Options
bool followsymlinks = false; // follow symlinks
bool dryrun = false; // only dryrun, dont destroy anything
bool remove_identical_inode = true; // remove files with identical inodes
bool remember_identical_inode = false; // remember files with identical inodes, implies remove_identical_inode
bool usemd5 = false; // use md5 checksum to check for similarity
bool usesha1 = false; // use sha1 checksum to check for similarity
bool usesha256 = false; // use sha256 checksum to check for similarity
Expand Down Expand Up @@ -164,6 +168,8 @@ parseOptions(Parser& parser)
o.dryrun = parser.get_parsed_bool();
} else if (parser.try_parse_bool("-removeidentinode")) {
o.remove_identical_inode = parser.get_parsed_bool();
} else if (parser.try_parse_bool("-rememberidentinode")) {
o.remember_identical_inode = parser.get_parsed_bool();
} else if (parser.try_parse_bool("-deterministic")) {
o.deterministic = parser.get_parsed_bool();
} else if (parser.try_parse_string("-checksum")) {
Expand Down Expand Up @@ -334,9 +340,10 @@ main(int narg, const char* argv[])
// list.
gswd.markitems();

if (o.remove_identical_inode) {
if (o.remove_identical_inode || o.remember_identical_inode) {
// remove files with identical devices and inodes from the list
std::cout << dryruntext << "Removed " << gswd.removeIdenticalInodes()
std::cout << dryruntext << "Removed " << (o.remember_identical_inode ? "(but remembered) " : "")
<< gswd.removeIdenticalInodes(o.remember_identical_inode)
<< " files due to nonunique device and inode." << std::endl;
}

Expand Down
Loading