Re: Finding and removing duplicate files
On 19.11.11 15:07, Michael Shigorin wrote:
On Sun, Nov 06, 2011 at 08:31:09PM +0600, Andrey Rahmatullin wrote:
What programs are there for this? Ideally something that lets
you view the list of duplicate files and selectively delete
any of them.
Slightly off topic: instead of deleting duplicates, I usually
replace them with a single file under several names (e.g. via
ln(1)), after first checking that the mtime and the contents
match. (And, of course, that they really are /different/ files.)
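A minimal sketch of that check-then-link approach (the function name and
the paths below are made up for illustration; note that hard links only
work within a single filesystem, and a more careful implementation would
link to a temporary name first and rename it into place):

import os, filecmp

def replace_with_hardlink( keep, dup ):
    # Nothing to do if the two names already point at the same inode.
    st_keep, st_dup = os.stat( keep ), os.stat( dup )
    if ( st_keep.st_dev, st_keep.st_ino ) == ( st_dup.st_dev, st_dup.st_ino ):
        return
    # Require matching mtime and identical contents before touching anything.
    if int( st_keep.st_mtime ) != int( st_dup.st_mtime ):
        return
    if not filecmp.cmp( keep, dup, shallow = False ):
        return
    os.unlink( dup )
    os.link( keep, dup )

replace_with_hardlink( 'photos/a/img_001.jpg', 'photos/b/img_001.jpg' )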
apt-get install hardlink
JFYI, ldv@ recommended hardlinkpy instead of it a while ago..
I don't seem to see it in Debian, though -- http://hardlinkpy.googlecode.com/
I used fdupes. Then I rewrote it in Python (extending it and speeding it up).
#!/usr/bin/env python
# FDUPES Copyright (c) 1999-2002 Adrian Lopez
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
__version__ = '1.50-PR2-s1'
import sys, os, stat, optparse, time
try:
from hashlib import md5
except ImportError:
from md5 import md5
CHUNK_SIZE = 8192
PARTIAL_MD5_SIZES = [4096, 16384, 65536, 262144, 1048576]
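# Comparison strategy (see checkmatch/confirmmatch below): candidate
# duplicates are ordered by MD5 digests of progressively larger file
# prefixes, computed lazily one size at a time; CHUNK_SIZE is the read
# size used for the final byte-for-byte confirmation.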
class filestat:
def __init__( self, path ):
self.filename = path
self.stat = os.stat( path )
self.signatures = [None] * len( PARTIAL_MD5_SIZES )
self.duplicates = None
def getcrcpartial( self, i ):
if self.signatures[i] is None:
f = open( self.filename, 'rb' )
self.signatures[i] = md5( f.read( PARTIAL_MD5_SIZES[i] ) ).digest()
f.close()
if self.stat.st_size <= PARTIAL_MD5_SIZES[i]:
for j in range( i, len( self.signatures ) ):
self.signatures[j] = self.signatures[i]
return self.signatures[i]
# def getcrcsignature( self ):
# if self.signatures[-1] is None:
# self.signatures[-1] = getcrcsignature( self.filename )
# return self.signatures[-1]
# def getcrcsignature( filename ):
# f = open( filename, 'rb' )
# m = md5()
# chunk = f.read( CHUNK_SIZE )
# while chunk:
# m.update( chunk )
# chunk = f.read( CHUNK_SIZE )
# f.close()
# return m.digest()
def registerfile( file ):
file.left = None
file.right = None
return file
def errormsg( message, *args ):
sys.stderr.write( '\r' + ( ' ' * 40 ) + '\r' + sys.argv[0] + ': ' )
print >>sys.stderr, message % args
def escapechar( escape_list, c ):
if c in escape_list:
return '\\' + c
return c
def escapefilename( escape_list, filename ):
    return ''.join( [escapechar( escape_list, c ) for c in filename] )
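# Build a flat list of filestat objects for all regular files reachable
# from the given paths, recursing into directories with --recurse and
# following symlinks only with --symlinks; empty files are skipped with
# --noempty.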
def grokfiles( input ):
input = input[:]
output = []
indicator = '-\\|/'
if not options.hideprogress:
progress = 0
while input:
path = input.pop()
if not options.hideprogress:
if progress % 100 == 0:
sys.stderr.write( '\rBuilding file list %c' %
indicator[progress / 100 % len( indicator )] )
progress += 1
try:
newfile = filestat( path )
if newfile.stat.st_size == 0 and options.excludeempty:
continue
lstat = os.lstat( path )
if stat.S_ISDIR( newfile.stat.st_mode ):
if options.recurse and ( options.followlinks or not stat.S_ISLNK( lstat.st_mode ) ):
try:
for name in os.listdir( path ):
input.append( os.path.join( path, name ) )
except OSError:
                        errormsg( 'could not list directory %s', path )
continue
elif stat.S_ISREG( lstat.st_mode ) or ( stat.S_ISLNK( lstat.st_mode ) and options.followlinks ):
output.append( newfile )
except OSError:
pass
return output
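# Look up `file` among previously seen files: candidates are bucketed by
# size, and within a bucket kept in a binary tree ordered by the partial
# MD5 signatures.  Returns a potential match, or None after registering
# the file as a new tree node; hard links to an already seen file only
# count as matches with --hardlinks.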
def checkmatch( checktable, file ):
if file.stat.st_size not in checktable:
        checktable[file.stat.st_size] = registerfile( file )
        return None
    checktree = checktable[file.stat.st_size]
while True:
# If device and inode fields are equal one of the files is a
# hard link to the other or the files have been listed twice
# unintentionally. We don't want to flag these files as
# duplicates unless the user specifies otherwise.
if ( file.stat.st_ino == checktree.stat.st_ino and
file.stat.st_dev == checktree.stat.st_dev ):
if options.considerhardlinks:
return checktree
else:
return None
try:
cmpresult = 0
for i in range( len( PARTIAL_MD5_SIZES ) ):
cmpresult = cmp( file.getcrcpartial( i ), checktree.getcrcpartial( i ) )
if cmpresult != 0:
#errormsg( ' on %s vs %s', file.filename, checktree.filename )
break
# if cmpresult == 0:
# cmpresult = cmp( file.getcrcsignature(), checktree.getcrcsignature() )
# #if cmpresult != 0:
# # errormsg( 'P on %s vs %s', file.filename, checktree.filename )
# #else:
# # errormsg( 'P F on %s vs %s', file.filename, checktree.filename )
# # print '%s matches %s' %( file.filename, checktree.filename )
except IOError:
cmpresult = 1
if cmpresult < 0:
if checktree.left is None:
checktree.left = registerfile( file )
return None
else:
checktree = checktree.left
elif cmpresult > 0:
if checktree.right is None:
checktree.right = registerfile( file )
return None
else:
checktree = checktree.right
else:
return checktree
# Do a bit-for-bit comparison in case two different files produce the
# same signature. Unlikely, but better safe than sorry.
def confirmmatch( file1, file2 ):
try:
f1 = open( file1, 'rb' )
f2 = open( file2, 'rb' )
while True:
c1 = f1.read( CHUNK_SIZE )
c2 = f2.read( CHUNK_SIZE )
if c1 != c2:
f1.close()
f2.close()
return False
if not c1:
f1.close()
f2.close()
return True
except IOError:
return False
def formatsize( size ):
if size < 1000:
return '%d bytes' % ( size )
elif size <= 1000000:
return '%.1f kilobytes' % ( size / 1000.0 )
else:
return '%.1f megabytes' % ( size / 1000000.0 )
def summarizematches( groups ):
if not groups:
print 'No duplicates found.'
else:
numfiles = 0
numbytes = 0
for duplicates in groups:
numfiles += len( duplicates ) - 1
numbytes += ( len( duplicates ) - 1 ) * duplicates[0].stat.st_size
print '%d duplicate files (in %d sets), occupying %s' % ( numfiles, len( groups ),
formatsize( numbytes ) )
print
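# Like summarizematches, but also report the space the duplicates actually
# occupy on disk, rounding each file up to the filesystem block size.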
def summarizediskusage( groups ):
if not groups:
print 'No duplicates found.'
else:
numfiles = 0
numbytes = 0
for duplicates in groups:
numfiles += len( duplicates ) - 1
numbytes += ( len( duplicates ) - 1 ) * duplicates[0].stat.st_size
progress = 0
numvfsbytes = 0
for duplicates in groups:
for file in duplicates[1:]:
statvfs = os.statvfs( file.filename )
                bsize = statvfs.f_bsize
numvfsbytes += ( file.stat.st_size + bsize - 1 ) // bsize * bsize
if not options.hideprogress:
sys.stderr.write( '\rCounting space [%d/%d] %.0f%% ' % ( progress, numfiles, 100.0 * progress / numfiles ) )
progress += 1
if not options.hideprogress:
sys.stderr.write( '\r%40s\r' % ' ' )
print '%d duplicate files (in %d sets), occupying %s (%s on disk).' % ( numfiles, len( groups ),
formatsize( numbytes ), formatsize( numvfsbytes ) )
print
def printfilename( filename ):
if options.dsameline:
        sys.stdout.write( escapefilename( '\\ ', filename ) + ' ' )
else:
sys.stdout.write( filename + '\n' )
def printmatches( groups ):
for duplicates in groups:
if options.omitfirst:
duplicates = duplicates[1:]
else:
if options.showsize:
size = duplicates[0].stat.st_size
print size,
if size != 1:
print 'bytes each:'
else:
print 'byte each:'
for file in duplicates:
printfilename( file.filename )
print
def maketemplink( duplicates, file ):
for base in duplicates:
if base != file and file.stat.st_dev == base.stat.st_dev and file.stat.st_ino != base.stat.st_ino:
try:
tmpname = os.tempnam( os.path.dirname( file.filename ), 'fdupes_' )
os.link( base.filename, tmpname )
return tmpname
except OSError:
pass
return None
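# Replace every file in each duplicate set with a hard link to another
# member of the set: first create a temporary link next to the file, then
# unlink the original and rename the temporary link into place.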
def relinkfiles( groups ):
for curgroup, duplicates in enumerate( groups ):
for i, file in enumerate( duplicates ):
tmpname = maketemplink( duplicates, file )
if tmpname is None:
print ' [+] %s' % file.filename
else:
try:
os.unlink( file.filename )
except OSError:
print ' [!] %s -- unable to unlink file!' % file.filename
                    os.unlink( tmpname )
continue
try:
os.rename( tmpname, file.filename )
file.stat = os.stat( file.filename )
print ' [@] %s' % file.filename
except OSError:
print ' [!] %s -- unable to rename file!' % file.filename
print
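# For each duplicate set, ask the user which files to preserve (or keep
# just the first one with --noprompt) and delete the rest.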
def deletefiles( groups ):
for curgroup, duplicates in enumerate( groups ):
if options.noprompt: # preserve only the first file
delete = list( range( 1, len( duplicates ) ) )
else:
for i, file in enumerate( duplicates ):
print '[%d] %s' % ( i, file.filename )
print
# prompt for files to preserve
while True:
prompt = ( 'Set %d of %d, preserve files [1 - %d, all]' %
                           ( curgroup + 1, len( groups ), len( duplicates ) ) )
if options.showsize:
size = duplicates[0].stat.st_size
prompt += ' (%d ' % size
if size != 1:
prompt += 'bytes each)'
else:
prompt += 'byte each)'
prompt += ': '
line = raw_input( prompt )
delete = list( range( len( duplicates ) ) )
for arg in line.strip().split():
if arg == 'all':
delete = []
break
delete.remove( int( arg ) - 1 )
if len( delete ) < len( duplicates ):
break
print
for i, file in enumerate( duplicates ):
if i not in delete:
print ' [+] %s' % file.filename
else:
try:
os.remove( file.filename )
print ' [-] %s' % file.filename
except OSError:
print ' [!] %s -- unable to delete file!' % file.filename
print
def sort_pairs_by_mtime( f1, f2 ):
return cmp( f1.stat.st_mtime, f2.stat.st_mtime )
if __name__ == '__main__':
parser = optparse.OptionParser(
usage = 'fdupes [options] DIRECTORY...',
version = __version__ )
parser.set_defaults( action = printmatches )
parser.add_option( '-r', '--recurse', action = 'store_true', dest = 'recurse',
                       help = 'for each directory given after this option, follow subdirectories encountered within' )
#parser.add_option( '-R', '--recurse:', action = 'append', dest = 'recurseafter' )
parser.add_option( '-s', '--symlinks', action = 'store_true', dest = 'followlinks',
help = 'follow symlinks' )
parser.add_option( '-H', '--hardlinks', action = 'store_true', dest = 'considerhardlinks',
                       help = 'normally, when two or more files point to the same disk area they are treated as non-duplicates; this option changes that behavior' )
parser.add_option( '-n', '--noempty', action = 'store_true', dest = 'excludeempty',
help = 'exclude zero-length files from consideration' )
parser.add_option( '-f', '--omitfirst', action = 'store_true', dest = 'omitfirst',
help = 'omit the first file in each set of matches' )
parser.add_option( '-1', '--sameline', action = 'store_true', dest = 'dsameline',
help = 'list each set of matches on a single line' )
parser.add_option( '-S', '--size', action = 'store_true', dest = 'showsize',
help = 'show size of duplicate files' )
parser.add_option( '-m', '--summarize', action = 'store_const', dest = 'action', const = summarizematches,
help = 'summarize dupe information' )
parser.add_option( '-M', '--diskusage', action = 'store_const', dest = 'action', const = summarizediskusage,
help = 'summarize disk usage information' )
parser.add_option( '-q', '--quiet', action = 'store_true', dest = 'hideprogress',
help = 'hide progress indicator' )
parser.add_option( '-d', '--delete', action = 'store_const', dest = 'action', const = deletefiles,
                       help = 'prompt user for files to preserve and delete all others; important: under particular circumstances, data may be lost when using this option together with -s or --symlinks, or when specifying a particular directory more than once; refer to the fdupes documentation for additional information' )
    parser.add_option( '-l', '--relink', action = 'store_const', dest = 'action', const = relinkfiles,
                       help = 'replace each duplicate with a hard link to another file in the same set (files must reside on the same filesystem)' )
parser.add_option( '-N', '--noprompt', action = 'store_true', dest = 'noprompt',
help = 'together with --delete, preserve the first file in each set of duplicates and delete the rest without prompting the user' )
( options, args ) = parser.parse_args()
options.recurseafter = False
if not args:
errormsg( 'no directories specified' )
sys.exit( 1 )
if options.recurse and options.recurseafter:
errormsg( 'options --recurse and --recurse: are not compatible' )
sys.exit( 1 )
files = grokfiles( args )
if not files:
if not options.hideprogress:
sys.stderr.write( '\r%40s\r' % ' ' )
sys.exit( 0 )
if not options.hideprogress:
filecount = len( files )
progress = 0
checktable = {}
groups = []
for curfile in files:
match = checkmatch( checktable, curfile )
if match:
if confirmmatch( curfile.filename, match.filename ):
if match.duplicates is None:
match.duplicates = [match]
groups.append( match.duplicates )
match.duplicates.append( curfile )
if not options.hideprogress:
sys.stderr.write( '\rProgress [%d/%d] %.0f%% ' %
( progress, filecount, 100.0 * progress / filecount ) )
progress += 1
if not options.hideprogress:
sys.stderr.write( '\r%40s\r' % ' ' )
for duplicates in groups:
duplicates.sort( cmp = sort_pairs_by_mtime )
options.action( groups )
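
For reference, assuming the script above is saved as fdupes.py (the file
name is just an example), typical invocations would be:

python fdupes.py -r -S ~/photos    # recurse, list duplicate sets with sizes
python fdupes.py -r -d ~/photos    # recurse, then choose which copies to delete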