Monday, August 29, 2016

Finding files of the same size in a directory

young@ubuntu-16:~/test$ cat
#!/usr/bin/env bash
#set -x
#This small script finds files that have the same size.

# Validate the single required argument: an existing directory to scan.
# The expansion is quoted so a path containing spaces is tested as one word.
if [[ -z "$1" || ! -d "$1" ]]; then
  # Usage errors are diagnostics, so they go to stderr.
  printf '%s\n' "Usage $0 directory_name" >&2
  # Exit with an explicit failure code: `exit $?` here would return the
  # status of the preceding (successful) print, i.e. 0.
  exit 1
fi
# This banner stays on stdout; it starts with a letter, which lets callers
# tell it apart from the result lines, which start with a size.
echo "current directory is $1"

#######################################
# List the regular files below a directory that share their size with at
# least one other file found there.
# Requires GNU find (-printf).
# Arguments: $1 - directory to search recursively
# Outputs:   one "SIZE  PATH" line (two spaces) per matching file on stdout,
#            ordered by size and then by path; nothing if no sizes repeat
# Returns:   0 on success, 1 if $1 is not a directory
# Limitation: file names containing a newline are not supported, because
#            records are newline-delimited.
#######################################
find_same_size() {
  local dir_name=$1

  if [[ ! -d "$dir_name" ]]; then
    printf '%s: not a directory: %s\n' "${FUNCNAME[0]}" "$dir_name" >&2
    return 1
  fi

  # find emits "SIZE<TAB>PATH", so no `ls` output has to be parsed and
  # spaces in names survive. Sorting by size puts equal sizes on adjacent
  # lines, which lets awk detect groups in a single streaming pass.
  find "$dir_name" -type f -printf '%s\t%p\n' \
    | LC_ALL=C sort -t $'\t' -k1,1n -k2 \
    | awk '
        {
          # Split on the FIRST tab only, so tabs inside a path are kept.
          tab = index($0, "\t")
          if (tab == 0) next   # not a SIZE<TAB>PATH record; skip it
          size = substr($0, 1, tab - 1)
          name = substr($0, tab + 1)

          if (seen && size == prev_size) {
            # The first member of a group is printed lazily, once a second
            # file with the same size shows up.
            if (!group_open) {
              printf "%s  %s\n", prev_size, prev_name
              group_open = 1
            }
            printf "%s  %s\n", size, name
          } else {
            group_open = 0
          }

          seen = 1
          prev_size = size
          prev_name = name
        }'
}

find_same_size "$1"

#Example usage
young@ubuntu-16:~/test$ bash tttt/
current directory is tttt/
26  tttt/iss
26  tttt/issue2
1654  tttt/pass
1654  tttt/passwd

#We can delete duplicated files as shown below.
#First drop every line that starts with a letter, keeping only the lines that begin with a number (the size),
#and then use the md5sum command to check whether the files really are identical.
young@ubuntu-16:~/test$ bash tttt/ | awk '{ if($1 !~ /^([[:alpha:]])+/) print $2}' | xargs md5sum
1e7672cbc2f76c3e9daad0e290e711b9  tttt/iss
1e7672cbc2f76c3e9daad0e290e711b9  tttt/issue2
cc44dec6bfd51d296c89d55e7c38b933  tttt/pass
cc44dec6bfd51d296c89d55e7c38b933  tttt/passwd

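#-w32 makes uniq compare only the first 32 characters of each line (the MD5 hash); -d prints one line (the first) for each group of adjacent lines with the same hash. Finally, xargs passes that output to rm -vf; the hash field is not a file name, and rm -f silently ignores it.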
young@ubuntu-16:~/test$ bash tttt/ | awk '{ if($1 !~ /^([[:alpha:]])+/) print $2}' | xargs md5sum | uniq -w32 -d | xargs rm -vf
removed 'tttt/iss'
removed 'tttt/pass'

