IT&&LINUX&&Config: Finding files of the same size in a directory

young@ubuntu-16:~/test$ cat find_same_size.sh
#!/usr/bin/env bash
#set -x
#This small script finds files that have the same size.
#######################################
# List the regular files under a directory that share their size with at
# least one other file in that tree.
# Arguments:
#   $1 - directory to search (recursively)
# Outputs:
#   stdout: the line "current directory is <dir>", followed by one
#           "<size> <path>" line for every file whose size is not unique,
#           grouped by ascending size
#   stderr: the usage message when $1 is empty or not a directory
# Returns:
#   Exits the script with status 1 on a usage error.
# Notes:
#   Requires GNU find (-printf) and GNU sort (-z).
#######################################
find_same_size() {
  local dir_name=${1-}
  local record size path
  local prev_size='' prev_path='' prev_printed=0

  if [[ -z "$dir_name" || ! -d "$dir_name" ]]; then
    # A usage error is a failure: report it on stderr with a non-zero status.
    printf 'Usage %s directory_name\n' "$0" >&2
    exit 1
  fi

  printf '%s\n' "current directory is $dir_name"

  # find emits NUL-terminated "<size> <path>" records, so paths containing
  # spaces, quotes or newlines survive intact. A stable numeric sort on the
  # size makes equal sizes adjacent while keeping find's order within a group.
  while IFS= read -r -d '' record; do
    size=${record%% *}   # text before the first space
    path=${record#* }    # everything after the first space

    if [[ "$size" == "$prev_size" ]]; then
      # The first member of a group is only known to be a duplicate once a
      # second file of that size shows up, so it is printed late, once.
      if (( prev_printed == 0 )); then
        printf '%s %s\n' "$prev_size" "$prev_path"
      fi
      printf '%s %s\n' "$size" "$path"
      prev_printed=1
    else
      prev_printed=0
    fi

    prev_size=$size
    prev_path=$path
  done < <(find "$dir_name" -type f -printf '%s %p\0' | sort -z -s -k1,1n)
}

# Forward the script's arguments as-is; quoting keeps a directory name that
# contains spaces in one piece.
find_same_size "$@"

#Example usage
young@ubuntu-16:~/test$ bash find_same_size.sh tttt/
current directory is tttt/
26 tttt/iss
26 tttt/issue2
1654 tttt/pass
1654 tttt/passwd

#We can delete the duplicate files as shown below.
#First, drop every line that starts with a letter (the "current directory is ..." header) and keep only the lines that begin with a number (the size), printing just the file name.
#Then run md5sum on those files to check whether they are really identical.
young@ubuntu-16:~/test$ bash find_same_size.sh tttt/ | awk '{ if($1 !~ /^([[:alpha:]])+/) print $2}' | xargs md5sum
1e7672cbc2f76c3e9daad0e290e711b9 tttt/iss
1e7672cbc2f76c3e9daad0e290e711b9 tttt/issue2
cc44dec6bfd51d296c89d55e7c38b933 tttt/pass
cc44dec6bfd51d296c89d55e7c38b933 tttt/passwd

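#-w32 makes uniq compare only the first 32 characters of each line (the MD5 hash); -d prints one line (the first) from each group of adjacent lines with the same hash. Finally xargs passes that line to rm -vf, which removes the listed file (the hash is passed as an argument too, but -f silently ignores it because no such file exists).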
young@ubuntu-16:~/test$ bash find_same_size.sh tttt/ | awk '{ if($1 !~ /^([[:alpha:]])+/) print $2}' | xargs md5sum | uniq -w32 -d | xargs rm -vf
removed 'tttt/iss'
removed 'tttt/pass'

IT&&LINUX&&Config

Monday, August 29, 2016

Finding files of the same size in a directory

No comments:

Post a Comment