#!/usr/bin/env bash # shellcheck disable=SC2059 _scriptname="cdupes" set -u set -e # how to use this function printusage { cat - >&2 < [dir] ... Options: -0 Null termination. Allows piping filenames with weird characters into other tools -5 md5sum. Uses md5sum instead of the default sha1sum -A NoHidden. Excludes files which names start with --> . <-- -c Checksum. Show checksum of duplicate files -f Omit first match from each group. Useful with -m for cleanup scripts -i Index. Don't compare files from the same argument with each other Probably not useful unless you specify multiple arguments in which to search -l Hard link support. Do not consider hard linked files duplicates of eachother -m Machine readable. No empty lines between groups Probably only useful in conjunction with -c or -f -n NoEmpty. Ignore empty files (size 0) -p Permissions. Don't consider files with different owner/group or permission bits as duplicates -r Recurse. For every directory given, follow subdirectories encountered within. -S Size. Show size of duplicate files -T Test. Runs a series of self-tests, then exits -q Quiet. Hides progress indicators -Q Quiet errors. Errors will not be printed. Does not hide progress indicators EOF } # read options opt_depth="-maxdepth 1" opt_size="" opt_hidden="" sumtool="sha1sum" quiet=false show_checksum=false show_size=false show_errors=true term='\n' machine_readable=false mind_permissions=false mind_index=false mind_links=false omit_first=false self_test=false OPTIND=1 while getopts ":05AcfhilmnprSTqQ" opt; do case "$opt" in 0) term='\0' ;; 5) sumtool="md5sum" ;; A) opt_hidden=".*" ;; c) show_checksum=true ;; f) omit_first=true ;; h) printusage; exit 0 ;; i) mind_index=true ;; l) mind_links=true ;; m) machine_readable=true ;; n) opt_size="-size +0" ;; p) mind_permissions=true ;; S) show_size=true ;; T) self_test=true ;; r) opt_depth="" ;; q) quiet=true ;; Q) show_errors=false ;; [?]) echo "Unknown option -${OPTARG}" >&2 exit 1 ;; :) echo "Option -${OPTARG} requires an argument" >&2 exit 1 ;; esac done shift $((OPTIND-1)) if [[ "${1:-}" = "--" ]]; then shift; fi # how to print function println { printf '%s\n' "$*"; } function println2 { printf '%s\n' "$*" >&2; } # mind the locale function sort { LC_ALL=C command sort "$@"; } # find the files to consider function list_files { { local code=0 i # shellcheck disable=SC2086 for i in "$@"; do find "$i" \ $opt_depth \ $opt_size \ ! -name "$opt_hidden" \ -type f \ -printf "checksum|${i}|%s|%U-%G-%m|%D-%i|%p\\0" || code=$? done return "$code" } | linecount 'Files' } # filter already sorted lines, printing duplicates function filter_dupes_only { local _checksum="checksum" _index="index" _size="size" _perm="perm" _link="link" local include opts=( '-z' '-t' '|' ) for include; do printf -v "_${include}" '' case "$include" in checksum) opts+=( '-k' '1,1' ) ;; index) opts+=( '-k' '2,2n' ) ;; size) opts+=( '-k' '3,3nr' ) ;; perm) opts+=( '-k' '4,4n' ) ;; links) opts+=( '-k' '5,5n' ) ;; esac done local previous="" previous_cmp="" cmp local checksum index size perm link path while IFS='|' read -r -d $'\0' checksum index size perm link path; do printf -v cmp '%s|%s|%s|%s|%s' \ "${_checksum:-${checksum}}" \ "${_index:-${index}}" \ "${_size:-${size}}" \ "${_perm:-${perm}}" \ "${_link:-${link}}" if [[ "$previous_cmp" == "$cmp" ]]; then if [[ -n "$previous" ]]; then printf '%s\0' "$previous" previous="" fi printf '%s|%s|%s|%s|%s|%s\0' "$checksum" "$index" "$size" "$perm" "$link" "$path" else previous_cmp="$cmp" printf -v previous '%s|%s|%s|%s|%s|%s' "$checksum" "$index" "$size" "$perm" "$link" "$path" fi done < <(sort "${opts[@]}") } # count lines - used for progress only function linecount { local line i=0 while IFS='|' read -r -d $'\0' line; do $quiet || progress '\r%s' "$1: $((++i))" printf '%s\0' "$line" done if (( i )); then progress '\n' else $quiet || progress '%s: 0\n' "$1" fi } # calculate checksum for each line function filter_checksum { sort -R -z | { local i=0 code=0 local checksum index size perm link path while IFS='|' read -r -d $'\0' checksum index size perm link path; do checksum="$($sumtool "$path")" || { code=$?; continue; } printf '%s|%s|%s|%s|%s|%s\0' "${checksum%% *}" "$index" "$size" "$perm" "$link" "$path" done return $code } | linecount 'Checksum' | filter_dupes_only checksum } # print output to the user function output { local first=true lastsum="" local checksum index size perm link path while IFS='|' read -r -d $'\0' checksum index size perm link path; do # end progress indicators with newline to stderr $quiet || ! $first || progress '\n' # new group if [[ "$checksum" != "$lastsum" ]]; then $first && first=false || $machine_readable || printf '\n' lastsum="$checksum" ! $omit_first || continue fi # print dupe ! $show_checksum || printf '%s ' "$checksum" ! $show_size || printf '%s ' "$size" printf "%s${term}" "$path" done < <(sort -z -t '|' -k '1,1' -k '6,6') ! $first || return "$file_not_found" # signal to the outer shell that we found nothing } # consider ignoring hard links function filter_hard_links { if ! $mind_links; then cat else sort -z -t '|' -k '5' | { local last="" local checksum index size perm link path while IFS='|' read -r -d $'\0' checksum index size perm link path; do if [[ "$link" != "$last" ]]; then printf '%s|%s|%s|%s|%s|%s\0' "$checksum" "$index" "$size" "$perm" "$link" "$path" fi last="$link" done } | linecount 'Unique inodes' fi } # filter to duplicate size/permissions function filter_size_perm { if ! $mind_permissions; then filter_dupes_only size | linecount 'Same size' else filter_dupes_only size perm | linecount 'Same size/perm' fi } # remove checksum dupes with the same index function filter_index { if ! $mind_index; then cat else sort -z -t '|' -k '1,1' -k '2,2n' | { local last="" cmp local checksum index size perm link path while IFS='|' read -r -d $'\0' checksum index size perm link path; do printf -v cmp '%s|%s' "$checksum" "$index" if [[ "$cmp" != "$last" ]]; then printf '%s|%s|%s|%s|%s|%s\0' "$checksum" "$index" "$size" "$perm" "$link" "$path" fi last="$cmp" done } | filter_dupes_only checksum | linecount 'Different index' fi } # self-test? if $self_test; then tmp="$(mktemp -d)" trap 'rm -rf "$tmp"' EXIT function clean { cd "$tmp"; find "$tmp" -mindepth 1 -maxdepth 1 -exec rm -rf '{}' \;; } function begin { printf '%s: ' "$*"; cd "$tmp"; } function run { local result sum cd "$tmp" result="$("${BASH_SOURCE[0]}" "$@" 2>&1 | base64 -w0)" sum="$(base64 -d <<<"$result" | sha256sum | awk '{print $1}')" if [[ "$SUM" = "$sum" ]]; then println "OK" else println "Failed!" println println "--- Output ---" base64 -d <<<"$result" println "--- end ---" println println "Checksum: ${sum}" exit 1 fi } println "Testing..." println clean echo "Match" >foo echo "Match" >bar echo "No match" >baz begin 'Basics' SUM="75bbdfc20481bd3ce079367a78db2d7c364be826e8b1a0ee9be51e9daf1f4e82" run . clean mkdir foo mkdir bar echo "Match" >foo/a echo "Match" >foo/b echo "Match" >bar/c begin 'Recursive' SUM="4a33a052c8b1c65c5a14c20eb0b706eb017024d1e5fd1e936445c3f4e50dc067" run -r foo bar clean mkdir foo mkdir bar echo "Match" >foo/a echo "Match" >foo/b echo "Match" >bar/c begin 'Quiet' SUM="70271993fe701d65eed4464160f1b96284db7e914267e6c21e37d9b58ea7386b" run -r -q foo bar begin 'Machine readable 1/3' SUM="8b01b5c2ab744e9a3866fdc402f8e44db3dd7f3fa20466e6dbe24a5ae3ba5a41" run -r -m -q . begin 'Machine readable 2/3' SUM="24752e03d38f07abb04717f2d89df67719ed642c2a867b4b0b54c6147bd5a4f2" run -r -m -q -f . begin 'Machine readable 3/3' SUM="ea2c205b52b4de3737898f0f8994f0fdd69f7ec5eeb6312dcd7052164ae517f8" run -r -m -q -c . clean touch foo bar .baz begin 'Ignore hidden' SUM="cefc8e35e1f038dc5711fce8db88048df467ac54a4671b43c68919c2e1b3e4ee" run -A . begin 'Ignore empty' SUM="6df6686e08a0fdaab53855460f73fcc552f61b9cda62bde43fd33fd1a7b14b64" run -n . begin 'Null termination' SUM="7e7916c52026e5d345670060acacf9a3c0fc010cdf9cf160107f10ea47c17963" run -0 . clean mkdir -m 000 foo bar mkdir baz touch baz/a baz/b begin 'Printing errors' SUM="6c5a9e4b3900a978dfea7950dc258f89c6ad7ca8ec9fe472ac0a4f902b1ac292" run foo bar baz begin 'Quiet errors' SUM="8c309146161a48daa8a1e10d4471c69952f372c3a353ac7d78dbb26421085d3d" run -Q foo bar baz begin 'Very quiet' SUM="8849d6e263be4b3801716c5408edb0e5958602dbd6ee760d7f55fb3b778c6a3f" run -q -Q foo bar baz clean dd if=/dev/zero of=file bs=1M count=4 >/dev/null 2>&1 ln file file2 ln file file3 echo "Match" >foo echo "Match" >bar chmod 456 bar touch baz begin 'Size printing' SUM="90adf35503f3c3532ead2de30429a546acea43872021e8744b17fb400d92a1e9" run -r -q -S . begin 'Size and checksum printing' SUM="9af3b90ae7a9e3f18a51c023503553ea486c951ba3007ee6aa7085edf7139e07" run -r -q -S -c . begin 'Ignored hard links' SUM="ddbe1de8dcd9c8c9b9380bcee198fb54d24d710090861ba75b755239e2a3916c" run -r -q -l . begin 'Ignored permissions differences' SUM="97f27672bc1ade7e5c6049f08ebcce2634c8e15697b285d57826c2e02aed4592" run -r -q -p . clean mkdir foo bar baz touch foo/a foo/b touch bar/a bar/b bar/c bar/d echo "Match" >baz/a echo "Match" >baz/b begin 'Ignore same index 1/2' SUM="7e6a81d240331197b391a91d782e5f7b19279fa23f30d6ed3c3063dd9779d837" run -r -i . begin 'Ignore same index 2/2' SUM="882728756b63ebb74e0c5619613068f86dc00641c5a688468728b175877243f6" run -r -i foo bar baz println println "All tests successful!" exit 0 fi # find where to search declare -a dirs=() re='^[/\.[:alnum:]]' for dir; do [[ "$dir" =~ $re ]] || dir="./$dir" dirs+=( "$dir" ) done # complain if we weren't given anywhere to search if [[ -z "${dirs[*]:-}" ]]; then println "$_scriptname: no directories specified" printusage exit 0 fi # run it! exec 4>&2 { function progress { printf "$@" >&4; } file_not_found=220 # hope none of the programs used here use this exit status... errors=$( set +e -o pipefail # allow errors, and propagate exit status through pipes exec 3>&2 2>&1 1>&3- # swap stdout and stderr list_files "${dirs[@]}" | filter_hard_links | filter_size_perm | filter_checksum | filter_index | output ) && code=$? || code=$? } 2>&1 exec 4>&- # complain about non-zero exit code case "$code" in 0) ;; # *) printf '\n' >&2 ;;& # prints an empty line to stderr if $code is not 0 "$file_not_found") $quiet || printf '\n%s\n' "No duplicate files found!" >&2 ;; *) ! $show_errors || printf '\n%s\n' "Duplicate search exited with error status: $code" >&2 ;; esac # complain about errors if $show_errors && [[ -n "${errors:-}" ]]; then $quiet || ! (( code )) || printf '\n' >&2 printf 'ERRORS:\n%s\n' "$errors" >&2 fi