#!/usr/bin/env bash
# shellcheck disable=SC2059
_scriptname="cdupes"
set -u
set -e


# how to use this
function printusage
{
cat - >&2 <<EOF

Description:
  Bash script that's functionally similar to the tool "fdupes"

Usage: ${_scriptname} [options] <dir> [dir] ...
Options:
  -0
    Null termination. Allows piping filenames with weird characters into other tools
  -5
    md5sum. Uses md5sum instead of the default sha1sum
  -A
    NoHidden. Excludes files which names start with --> . <--
  -c
    Checksum. Show checksum of duplicate files
  -f
    Omit first match from each group. Useful with -m for cleanup scripts
  -i
    Index. Don't compare files from the same argument with each other
    Probably not useful unless you specify multiple arguments in which to search
  -l
    Hard link support. Do not consider hard linked files duplicates of eachother
  -m
    Machine readable. No empty lines between groups
    Probably only useful in conjunction with -c or -f
  -n
    NoEmpty. Ignore empty files (size 0)
  -p
    Permissions. Don't consider files with different owner/group or permission bits as duplicates
  -r
    Recurse. For every directory given, follow subdirectories encountered within.
  -S
    Size. Show size of duplicate files
  -T
    Test. Runs a series of self-tests, then exits
  -q
    Quiet. Hides progress indicators
  -Q
    Quiet errors. Errors will not be printed. Does not hide progress indicators

EOF
}


# read options
opt_depth="-maxdepth 1"
opt_size=""
opt_hidden=""
sumtool="sha1sum"
quiet=false
show_checksum=false
show_size=false
show_errors=true
term='\n'
machine_readable=false
mind_permissions=false
mind_index=false
mind_links=false
omit_first=false
self_test=false
OPTIND=1
while getopts ":05AcfhilmnprSTqQ" opt; do
	case "$opt" in
		0) term='\0' ;;
		5) sumtool="md5sum" ;;
		A) opt_hidden=".*" ;;
		c) show_checksum=true ;;
		f) omit_first=true ;;
		h) printusage; exit 0 ;;
		i) mind_index=true ;;
		l) mind_links=true ;;
		m) machine_readable=true ;;
		n) opt_size="-size +0" ;;
		p) mind_permissions=true ;;
		S) show_size=true ;;
		T) self_test=true ;;
		r) opt_depth="" ;;
		q) quiet=true ;;
		Q) show_errors=false ;;
		[?])
			echo "Unknown option -${OPTARG}" >&2
			exit 1
		;;
		:)
			echo "Option -${OPTARG} requires an argument" >&2
			exit 1
		;;
	esac
done
shift $((OPTIND-1))
if [[ "${1:-}" = "--" ]]; then shift; fi


# how to print
function println { printf '%s\n' "$*"; }
function println2 { printf '%s\n' "$*" >&2; }


# mind the locale
function sort { LC_ALL=C command sort "$@"; }


# find the files to consider
function list_files
{
	{
		local code=0 i
		# shellcheck disable=SC2086
		for i in "$@"; do
			find "$i" \
				$opt_depth \
				$opt_size \
				! -name "$opt_hidden" \
				-type f \
				-printf "checksum|${i}|%s|%U-%G-%m|%D-%i|%p\\0" || code=$?
		done
		return "$code"
	} | linecount 'Files'
}


# filter already sorted lines, printing duplicates
function filter_dupes_only
{
	local _checksum="checksum" _index="index" _size="size" _perm="perm" _link="link" 
	local include opts=( '-z' '-t' '|' )
	for include; do
		printf -v "_${include}" ''
		case "$include" in
			checksum) opts+=( '-k' '1,1' ) ;;
			index) opts+=( '-k' '2,2n' ) ;;
			size) opts+=( '-k' '3,3nr' ) ;;
			perm) opts+=( '-k' '4,4n' ) ;;
			links) opts+=( '-k' '5,5n' ) ;;
		esac
	done

	local previous="" previous_cmp="" cmp
	local checksum index size perm link path
	while IFS='|' read -r -d $'\0' checksum index size perm link path; do
		printf -v cmp '%s|%s|%s|%s|%s' \
			"${_checksum:-${checksum}}" \
			"${_index:-${index}}" \
			"${_size:-${size}}" \
			"${_perm:-${perm}}" \
			"${_link:-${link}}"
		if [[ "$previous_cmp" == "$cmp" ]]; then
			if [[ -n "$previous" ]]; then
				printf '%s\0' "$previous"
				previous=""
			fi
			printf '%s|%s|%s|%s|%s|%s\0' "$checksum" "$index" "$size" "$perm" "$link" "$path"
		else
			previous_cmp="$cmp"
			printf -v previous '%s|%s|%s|%s|%s|%s' "$checksum" "$index" "$size" "$perm" "$link" "$path"
		fi
	done < <(sort "${opts[@]}")
}


# count lines - used for progress only
function linecount
{
	local line i=0
	while IFS='|' read -r -d $'\0' line; do
		$quiet || progress '\r%s' "$1: $((++i))"
		printf '%s\0' "$line"
	done
	if (( i )); then
		progress '\n'
	else
		$quiet || progress '%s: 0\n' "$1"
	fi
}


# calculate checksum for each line
function filter_checksum
{
	sort -R -z |
	{
		local i=0 code=0
		local checksum index size perm link path
		while IFS='|' read -r -d $'\0' checksum index size perm link path; do
			checksum="$($sumtool "$path")" || { code=$?; continue; }
			printf '%s|%s|%s|%s|%s|%s\0' "${checksum%% *}" "$index" "$size" "$perm" "$link" "$path"
		done
		return $code
	} |
	linecount 'Checksum' |
	filter_dupes_only checksum
}


# print output to the user
function output
{
	local first=true lastsum=""
	local checksum index size perm link path
	while IFS='|' read -r -d $'\0' checksum index size perm link path; do
		# end progress indicators with newline to stderr
		$quiet || ! $first || progress '\n'

		# new group
		if [[ "$checksum" != "$lastsum" ]]; then
			$first && first=false || $machine_readable || printf '\n'
			lastsum="$checksum"
			! $omit_first || continue
		fi

		# print dupe
		! $show_checksum || printf '%s ' "$checksum"
		! $show_size || printf '%s ' "$size"
		printf "%s${term}" "$path"
	done < <(sort -z -t '|' -k '1,1' -k '6,6')
	! $first || return "$file_not_found" # signal to the outer shell that we found nothing
}


# consider ignoring hard links
function filter_hard_links
{
	if ! $mind_links; then
		cat
	else
		sort -z -t '|' -k '5' |
		{
			local last=""
			local checksum index size perm link path
			while IFS='|' read -r -d $'\0' checksum index size perm link path; do
				if [[ "$link" != "$last" ]]; then
					 printf '%s|%s|%s|%s|%s|%s\0' "$checksum" "$index" "$size" "$perm" "$link" "$path"
				fi
				last="$link"
			done
		} |
		linecount 'Unique inodes'
	fi
}


# filter to duplicate size/permissions
function filter_size_perm
{
	if ! $mind_permissions; then
		filter_dupes_only size | linecount 'Same size'
	else
		filter_dupes_only size perm | linecount 'Same size/perm'
	fi
}


# remove checksum dupes with the same index
function filter_index
{
	if ! $mind_index; then
		cat
	else
		sort -z -t '|' -k '1,1' -k '2,2n' |
		{
			local last="" cmp
			local checksum index size perm link path
			while IFS='|' read -r -d $'\0' checksum index size perm link path; do
				printf -v cmp '%s|%s' "$checksum" "$index"
				if [[ "$cmp" != "$last" ]]; then
					 printf '%s|%s|%s|%s|%s|%s\0' "$checksum" "$index" "$size" "$perm" "$link" "$path"
				fi
				last="$cmp"
			done
		} |
		filter_dupes_only checksum |
		linecount 'Different index'
	fi
}


# self-test?
if $self_test; then
	tmp="$(mktemp -d)"
	trap 'rm -rf "$tmp"' EXIT
	function clean { cd "$tmp"; find "$tmp" -mindepth 1 -maxdepth 1 -exec rm -rf '{}' \;; }
	function begin { printf '%s: ' "$*"; cd "$tmp"; }
	function run
	{
		local result sum
		cd "$tmp"
		result="$("${BASH_SOURCE[0]}" "$@" 2>&1 | base64 -w0)"
		sum="$(base64 -d <<<"$result" | sha256sum | awk '{print $1}')"
		if [[ "$SUM" = "$sum" ]]; then
			println "OK"
		else
			println "Failed!"
			println
			println "--- Output ---"
			base64 -d <<<"$result"
			println "---  end   ---"
			println
			println "Checksum: ${sum}"
			exit 1
		fi
	}
	println "Testing..."
	println

	clean
	echo "Match" >foo
	echo "Match" >bar
	echo "No match" >baz

	begin 'Basics'
	SUM="75bbdfc20481bd3ce079367a78db2d7c364be826e8b1a0ee9be51e9daf1f4e82" run .

	clean
	mkdir foo
	mkdir bar
	echo "Match" >foo/a
	echo "Match" >foo/b
	echo "Match" >bar/c

	begin 'Recursive'
	SUM="4a33a052c8b1c65c5a14c20eb0b706eb017024d1e5fd1e936445c3f4e50dc067" run -r foo bar

	clean
	mkdir foo
	mkdir bar
	echo "Match" >foo/a
	echo "Match" >foo/b
	echo "Match" >bar/c

	begin 'Quiet'
	SUM="70271993fe701d65eed4464160f1b96284db7e914267e6c21e37d9b58ea7386b" run -r -q foo bar

	begin 'Machine readable 1/3'
	SUM="8b01b5c2ab744e9a3866fdc402f8e44db3dd7f3fa20466e6dbe24a5ae3ba5a41" run -r -m -q .
	begin 'Machine readable 2/3'
	SUM="24752e03d38f07abb04717f2d89df67719ed642c2a867b4b0b54c6147bd5a4f2" run -r -m -q -f .
	begin 'Machine readable 3/3'
	SUM="ea2c205b52b4de3737898f0f8994f0fdd69f7ec5eeb6312dcd7052164ae517f8" run -r -m -q -c .

	clean
	touch foo bar .baz

	begin 'Ignore hidden'
	SUM="cefc8e35e1f038dc5711fce8db88048df467ac54a4671b43c68919c2e1b3e4ee" run -A .

	begin 'Ignore empty'
	SUM="6df6686e08a0fdaab53855460f73fcc552f61b9cda62bde43fd33fd1a7b14b64" run -n .

	begin 'Null termination'
	SUM="7e7916c52026e5d345670060acacf9a3c0fc010cdf9cf160107f10ea47c17963" run -0 .

	clean
	mkdir -m 000 foo bar
	mkdir baz
	touch baz/a baz/b

	begin 'Printing errors'
	SUM="6c5a9e4b3900a978dfea7950dc258f89c6ad7ca8ec9fe472ac0a4f902b1ac292" run foo bar baz

	begin 'Quiet errors'
	SUM="8c309146161a48daa8a1e10d4471c69952f372c3a353ac7d78dbb26421085d3d" run -Q foo bar baz

	begin 'Very quiet'
	SUM="8849d6e263be4b3801716c5408edb0e5958602dbd6ee760d7f55fb3b778c6a3f" run -q -Q foo bar baz

	clean
	dd if=/dev/zero of=file bs=1M count=4 >/dev/null 2>&1
	ln file file2
	ln file file3
	echo "Match" >foo
	echo "Match" >bar
	chmod 456 bar
	touch baz

	begin 'Size printing'
	SUM="90adf35503f3c3532ead2de30429a546acea43872021e8744b17fb400d92a1e9" run -r -q -S .

	begin 'Size and checksum printing'
	SUM="9af3b90ae7a9e3f18a51c023503553ea486c951ba3007ee6aa7085edf7139e07" run -r -q -S -c .

	begin 'Ignored hard links'
	SUM="ddbe1de8dcd9c8c9b9380bcee198fb54d24d710090861ba75b755239e2a3916c" run -r -q -l .

	begin 'Ignored permissions differences'
	SUM="97f27672bc1ade7e5c6049f08ebcce2634c8e15697b285d57826c2e02aed4592" run -r -q -p .

	clean
	mkdir foo bar baz
	touch foo/a foo/b
	touch bar/a bar/b bar/c bar/d
	echo "Match" >baz/a
	echo "Match" >baz/b

	begin 'Ignore same index 1/2'
	SUM="7e6a81d240331197b391a91d782e5f7b19279fa23f30d6ed3c3063dd9779d837" run -r -i .

	begin 'Ignore same index 2/2'
	SUM="882728756b63ebb74e0c5619613068f86dc00641c5a688468728b175877243f6" run -r -i foo bar baz

	println
	println "All tests successful!"
	exit 0
fi	


# find where to search
declare -a dirs=()
re='^[/\.[:alnum:]]'
for dir; do
	[[ "$dir" =~ $re ]] || dir="./$dir"
	dirs+=( "$dir" )
done


# complain if we weren't given anywhere to search
if [[ -z "${dirs[*]:-}" ]]; then
	println "$_scriptname: no directories specified"
	printusage
	exit 0
fi


# run it!
exec 4>&2
{
	function progress { printf "$@" >&4; }
	file_not_found=220 # hope none of the programs used here use this exit status...
	errors=$(
		set +e -o pipefail # allow errors, and propagate exit status through pipes
		exec 3>&2 2>&1 1>&3- # swap stdout and stderr
		list_files "${dirs[@]}" |
			filter_hard_links |
			filter_size_perm |
			filter_checksum |
			filter_index |
			output
	) && code=$? || code=$?
} 2>&1
exec 4>&-


# complain about non-zero exit code
case "$code" in
	0) ;;
#	*) printf '\n' >&2 ;;& # prints an empty line to stderr if $code is not 0
	"$file_not_found") $quiet || printf '\n%s\n' "No duplicate files found!" >&2 ;;
	*) ! $show_errors || printf '\n%s\n' "Duplicate search exited with error status: $code" >&2 ;;
esac


# complain about errors
if $show_errors && [[ -n "${errors:-}" ]]; then
	$quiet || ! (( code )) || printf '\n' >&2
	printf 'ERRORS:\n%s\n' "$errors" >&2
fi