[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: Поиск и удаление дублирующих файлов



19.11.11 15:07, Michael Shigorin написав(ла):
On Sun, Nov 06, 2011 at 08:31:09PM +0600, Andrey Rahmatullin wrote:
Какие программы есть на эту тему?  Желательно, чтоб можно
было посмотреть список файлов-дубляжей и выборочно удалить
любой из них.
Несколько не в тему.  Вместо удаления повторов, я обычно
заменяю их одним файлом с несколькими именами (e. g.,
через ln(1)) предварительно проверив совпадение mtime и
содержимого.  (И, конечно, то, что это /различные/ файлы.)
apt-get install hardlink

JFYI, ldv@ некоторое время тому рекомендовал вместо него hardlinkpy..
В дебиане будто не наблюдаю -- http://hardlinkpy.googlecode.com/

Пользовался fdupes. Потом переписал на Python (дополнив и ускорив).
#!/usr/bin/env python
# FDUPES Copyright (c) 1999-2002 Adrian Lopez
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Version string reported by --version (derived from fdupes 1.50-PR2).
__version__ = '1.50-PR2-s1'

import optparse
import os
import stat
import sys
import tempfile
import time
try:
	from hashlib import md5
except ImportError:
	from md5 import md5


# Number of bytes read per iteration during the final byte-for-byte check.
CHUNK_SIZE = 8192

# Prefix lengths (bytes) for the progressively larger partial-MD5
# signatures; comparison stops at the first prefix that differs.
PARTIAL_MD5_SIZES = [4096, 16384, 65536, 262144, 1048576]

class filestat:
	"""Cached stat info and lazily computed partial-MD5 signatures for
	one file, plus binary-tree links added later by registerfile()."""

	def __init__( self, path ):
		self.filename = path
		self.stat = os.stat( path )
		# One slot per entry in PARTIAL_MD5_SIZES, filled on demand.
		self.signatures = [None] * len( PARTIAL_MD5_SIZES )
		# Lazily created list of filestat objects confirmed equal to this one.
		self.duplicates = None

	def getcrcpartial( self, i ):
		"""Return the MD5 digest of the first PARTIAL_MD5_SIZES[i] bytes.

		Computed on first request and cached.  May raise IOError.
		"""
		if self.signatures[i] is None:
			# 'with' guarantees the file is closed even if read() raises
			# (the original leaked the handle on that path).
			with open( self.filename, 'rb' ) as f:
				self.signatures[i] = md5( f.read( PARTIAL_MD5_SIZES[i] ) ).digest()
			if self.stat.st_size <= PARTIAL_MD5_SIZES[i]:
				# The whole file fits in this prefix, so every larger
				# prefix would hash the same bytes; fill them in now.
				for j in range( i, len( self.signatures ) ):
					self.signatures[j] = self.signatures[i]
		return self.signatures[i]

# 	def getcrcsignature( self ):
# 		if self.signatures[-1] is None:
# 			self.signatures[-1] = getcrcsignature( self.filename )
# 		return self.signatures[-1]

# def getcrcsignature( filename ):
# 	f = open( filename, 'rb' )
# 	m = md5()
# 	chunk = f.read( CHUNK_SIZE )
# 	while chunk:
# 		m.update( chunk )
# 		chunk = f.read( CHUNK_SIZE )
# 	f.close()
# 	return m.digest()

def registerfile( file ):
	"""Reset the binary-tree links of *file* and hand it back, ready to
	be inserted as a node of a signature tree."""
	file.left = file.right = None
	return file

def errormsg( message, *args ):
	"""Blank out any progress indicator on stderr, then report
	*message* %-formatted with *args*, prefixed by the program name.

	Uses sys.stderr.write() instead of the Python-2-only
	'print >>sys.stderr' chevron syntax; the output bytes are identical.
	"""
	sys.stderr.write( '\r' + ( ' ' * 40 ) + '\r' + sys.argv[0] + ': ' )
	sys.stderr.write( message % args + '\n' )


def escapechar( escape_list, c ):
	"""Return character *c*, backslash-prefixed when it is one of the
	characters in *escape_list*."""
	return '\\' + c if c in escape_list else c

def escapefilename( escape_list, filename ):
	"""Return *filename* with every character in *escape_list*
	backslash-escaped.

	Bug fix: the original did map( escapechar, filename ), calling the
	two-argument escapechar() with a single argument and raising
	TypeError whenever this function ran.  The escaping is inlined here
	so the function is self-contained.
	"""
	return ''.join( '\\' + c if c in escape_list else c for c in filename )


def grokfiles( input ):
	"""Expand *input* (a list of paths) into a list of filestat objects.

	Directories are descended into only with --recurse, and symlinks are
	followed only with --symlinks.  Zero-length files are skipped with
	--noempty.  Unreadable/vanished paths are skipped silently.  Result
	order is unspecified (a LIFO work stack is used internally).
	Reads the global 'options'.
	"""
	# Work on a copy so the caller's list is not consumed.
	input = input[:]
	output = []
	indicator = '-\\|/'
	if not options.hideprogress:
		progress = 0
	while input:
		path = input.pop()
		if not options.hideprogress:
			# Advance the spinner once every 100 paths.
			if progress % 100 == 0:
				sys.stderr.write( '\rBuilding file list %c' %
					indicator[progress / 100 % len( indicator )] )
			progress += 1
		try:
			newfile = filestat( path )
			if newfile.stat.st_size == 0 and options.excludeempty:
				continue
			# lstat does not follow symlinks (unlike filestat's os.stat),
			# so it tells us whether *path* itself is a link.
			lstat = os.lstat( path )
			if stat.S_ISDIR( newfile.stat.st_mode ):
				if options.recurse and ( options.followlinks or not stat.S_ISLNK( lstat.st_mode ) ):
					try:
						for name in os.listdir( path ):
							input.append( os.path.join( path, name ) )
					except OSError:
						errormsg( 'could not chdir to %s', path )
						continue
			elif stat.S_ISREG( lstat.st_mode ) or ( stat.S_ISLNK( lstat.st_mode ) and options.followlinks ):
				output.append( newfile )
		except OSError:
			# stat()/lstat() failed (permissions, file vanished): skip it.
			pass
	return output



def checkmatch( checktable, file ):
	"""Look *file* up in the size-keyed forest of signature trees.

	Returns a previously seen filestat whose partial signatures all
	match *file* (a candidate duplicate, still to be confirmed
	byte-for-byte by the caller), or None after inserting *file* as a
	new tree node.  Reads the global 'options'.
	"""
	if file.stat.st_size not in checktable:
		# Bug fix: the original indexed checktable with the module-level
		# 'curfile' instead of the 'file' parameter here and below; it
		# only worked because the sole caller happens to pass curfile.
		checktable[file.stat.st_size] = registerfile( file )
		return None
	checktree = checktable[file.stat.st_size]
	while True:
		# If device and inode fields are equal one of the files is a
		# hard link to the other or the files have been listed twice
		# unintentionally. We don't want to flag these files as
		# duplicates unless the user specifies otherwise.
		if ( file.stat.st_ino == checktree.stat.st_ino and
				file.stat.st_dev == checktree.stat.st_dev ):
			if options.considerhardlinks:
				return checktree
			else:
				return None

		try:
			# Compare progressively larger MD5 prefixes; stop at the
			# first prefix that differs.
			cmpresult = 0
			for i in range( len( PARTIAL_MD5_SIZES ) ):
				cmpresult = cmp( file.getcrcpartial( i ), checktree.getcrcpartial( i ) )
				if cmpresult != 0:
					break
		except IOError:
			# Unreadable file: treat it as 'greater' so it is filed away
			# down the right branch rather than reported as a match.
			cmpresult = 1

		if cmpresult < 0:
			if checktree.left is None:
				checktree.left = registerfile( file )
				return None
			checktree = checktree.left
		elif cmpresult > 0:
			if checktree.right is None:
				checktree.right = registerfile( file )
				return None
			checktree = checktree.right
		else:
			return checktree


# Do a bit-for-bit comparison in case two different files produce the
# same signature. Unlikely, but better safe than sorry.

# Do a bit-for-bit comparison in case two different files produce the
# same signature. Unlikely, but better safe than sorry.

def confirmmatch( file1, file2 ):
	"""Compare the files named *file1* and *file2* byte for byte.

	Returns True only when the contents are identical; any read/open
	error yields False.
	"""
	try:
		# 'with' closes both files even when read() raises -- the
		# original's explicit close() calls were skipped on the
		# IOError path, leaking both handles.
		with open( file1, 'rb' ) as f1:
			with open( file2, 'rb' ) as f2:
				while True:
					c1 = f1.read( CHUNK_SIZE )
					c2 = f2.read( CHUNK_SIZE )
					if c1 != c2:
						return False
					if not c1:
						# Both files exhausted with no difference.
						return True
	except IOError:
		return False


def formatsize( size ):
	"""Render a byte count as a human-readable decimal (SI) string.

	Bug fix: the original tested 'size <= 1000000', so exactly one
	megabyte was reported as '1000.0 kilobytes'; the boundary now
	mirrors the bytes/kilobytes one.
	"""
	if size < 1000:
		return '%d bytes' % ( size )
	elif size < 1000000:
		return '%.1f kilobytes' % ( size / 1000.0 )
	else:
		return '%.1f megabytes' % ( size / 1000000.0 )

def summarizematches( groups ):
	"""Print a one-line summary of how many duplicate files exist and
	how much apparent space they occupy, followed by a blank line."""
	if not groups:
		sys.stdout.write( 'No duplicates found.\n' )
	else:
		# Every file beyond the first in a set is redundant.
		numfiles = sum( len( duplicates ) - 1 for duplicates in groups )
		numbytes = sum(
			( len( duplicates ) - 1 ) * duplicates[0].stat.st_size
			for duplicates in groups )
		sys.stdout.write( '%d duplicate files (in %d sets), occupying %s\n' % (
			numfiles, len( groups ), formatsize( numbytes ) ) )
	sys.stdout.write( '\n' )


def summarizediskusage( groups ):
	"""Print duplicate counts plus apparent and on-disk wasted space.

	The on-disk figure rounds every redundant file up to its
	filesystem's block size (via os.statvfs), so it reflects actual
	allocation.  Reads the global 'options'.
	"""
	if not groups:
		print 'No duplicates found.'
	else:
		numfiles = 0
		numbytes = 0
		for duplicates in groups:
			# Every file beyond the first in a set is redundant.
			numfiles += len( duplicates ) - 1
			numbytes += ( len( duplicates ) - 1 ) * duplicates[0].stat.st_size
		progress = 0
		numvfsbytes = 0
		for duplicates in groups:
			for file in duplicates[1:]:
				statvfs = os.statvfs( file.filename )
				bsize = statvfs.f_bsize;
				# Round the file size up to a whole number of blocks.
				numvfsbytes += ( file.stat.st_size + bsize - 1 ) // bsize * bsize
				if not options.hideprogress:
					sys.stderr.write( '\rCounting space [%d/%d] %.0f%% ' % ( progress, numfiles, 100.0 * progress / numfiles ) )
					progress += 1
		if not options.hideprogress:
			# Erase the progress indicator before printing the summary.
			sys.stderr.write( '\r%40s\r' % ' ' )

		print '%d duplicate files (in %d sets), occupying %s (%s on disk).' % ( numfiles, len( groups ),
			formatsize( numbytes ), formatsize( numvfsbytes ) )
	print


def printfilename( filename ):
	"""Write one filename to stdout: backslash-escaped and
	space-terminated with --sameline, otherwise on its own line.
	Reads the global 'options'.

	Bug fix: the original called escapefilename() twice, discarding the
	first result.
	"""
	if options.dsameline:
		sys.stdout.write( escapefilename( '\\ ', filename ) + ' ' )
	else:
		sys.stdout.write( filename + '\n' )


def printmatches( groups ):
	"""Write every duplicate set to stdout, one set per paragraph,
	honouring --omitfirst and --size.  Reads the global 'options'."""
	for duplicates in groups:
		if options.omitfirst:
			duplicates = duplicates[1:]
		elif options.showsize:
			size = duplicates[0].stat.st_size
			unit = 'bytes each:' if size != 1 else 'byte each:'
			sys.stdout.write( '%d %s\n' % ( size, unit ) )
		for entry in duplicates:
			printfilename( entry.filename )
		sys.stdout.write( '\n' )


def maketemplink( duplicates, file ):
	"""Create a temporary hard link, in *file*'s directory, to some other
	member of *duplicates* that lives on the same device as *file* but is
	not already the same inode.

	Returns the temporary link's path, or None when no suitable base
	exists or linking failed.
	"""
	for base in duplicates:
		if base != file and file.stat.st_dev == base.stat.st_dev and file.stat.st_ino != base.stat.st_ino:
			try:
				# tempfile.mkstemp replaces the insecure (and since
				# removed) os.tempnam: reserve a unique name, then swap
				# the placeholder for a hard link to the base file.
				# NOTE(review): a tiny unlink/link race remains; an
				# attacker-writable directory could still intervene.
				fd, tmpname = tempfile.mkstemp( prefix = 'fdupes_',
					dir = os.path.dirname( file.filename ) )
				os.close( fd )
				os.unlink( tmpname )
				os.link( base.filename, tmpname )
				return tmpname
			except OSError:
				pass
	return None

def relinkfiles( groups ):
	"""Replace each duplicate with a hard link to another copy in its set.

	For every file, a temporary hard link to a sibling duplicate is
	created, the original name removed, and the link renamed over it.
	Files for which no link could be made are reported and left alone.
	"""
	for duplicates in groups:
		for file in duplicates:
			tmpname = maketemplink( duplicates, file )
			if tmpname is None:
				# No linkable sibling (different device, already the same
				# inode, or link() failed): keep the file untouched.
				print( '   [+] %s' % file.filename )
			else:
				try:
					os.unlink( file.filename )
				except OSError:
					print( '   [!] %s -- unable to unlink file!' % file.filename )
					# Bug fix: the original called os.unlink( tmp ) with an
					# undefined name, raising NameError instead of removing
					# the temporary link.
					os.unlink( tmpname )
					continue
				try:
					os.rename( tmpname, file.filename )
					file.stat = os.stat( file.filename )
					print( '   [@] %s' % file.filename )
				except OSError:
					print( '   [!] %s -- unable to rename file!' % file.filename )
		print( '' )


def deletefiles( groups ):
	"""Delete duplicates, keeping the files the user chooses to preserve
	(or, with --noprompt, keeping only the first file of each set).

	Bug fixes: file labels were printed 0-based while the prompt text
	and the parser below ('int( arg ) - 1') expect 1-based numbers, and
	the set counter started at 0; invalid input (non-numeric,
	out-of-range or repeated index) crashed with ValueError instead of
	re-prompting.
	"""
	for curgroup, duplicates in enumerate( groups ):
		if options.noprompt: # preserve only the first file
			delete = list( range( 1, len( duplicates ) ) )
		else:
			for i, file in enumerate( duplicates ):
				print( '[%d] %s' % ( i + 1, file.filename ) )
			print( '' )
			# prompt for files to preserve
			while True:
				prompt = ( 'Set %d of %d, preserve files [1 - %d, all]' %
					( curgroup + 1, len( groups ), len( duplicates ) ) )
				if options.showsize:
					size = duplicates[0].stat.st_size
					prompt += ' (%d ' % size
					prompt += 'bytes each)' if size != 1 else 'byte each)'
				prompt += ': '

				line = raw_input( prompt )
				delete = list( range( len( duplicates ) ) )
				try:
					for arg in line.strip().split():
						if arg == 'all':
							delete = []
							break
						delete.remove( int( arg ) - 1 )
				except ValueError:
					# Bad index: restore state by re-prompting.
					continue
				if len( delete ) < len( duplicates ):
					break
		print( '' )
		for i, file in enumerate( duplicates ):
			if i not in delete:
				print( '   [+] %s' % file.filename )
			else:
				try:
					os.remove( file.filename )
					print( '   [-] %s' % file.filename )
				except OSError:
					print( '   [!] %s -- unable to delete file!' % file.filename )
		print( '' )


def sort_pairs_by_mtime( f1, f2 ):
	"""cmp()-style comparator ordering filestat objects by modification
	time: negative/zero/positive when f1 is older/equal/newer.

	Replaces the Python-2-only builtin cmp() with the equivalent
	portable expression; return values are identical.
	"""
	a = f1.stat.st_mtime
	b = f2.stat.st_mtime
	return ( a > b ) - ( a < b )


if __name__ == '__main__':
	parser = optparse.OptionParser(
		usage = 'fdupes [options] DIRECTORY...',
		version = __version__ )
	parser.set_defaults( action = printmatches )
	# Help texts below fix a search-and-replace artifact that had turned
	# every 'this' into 'self'.
	parser.add_option( '-r', '--recurse', action = 'store_true', dest = 'recurse',
		help = 'for each directory given after this option follow subdirectories encountered within' )
	#parser.add_option( '-R', '--recurse:', action = 'append', dest = 'recurseafter' )
	parser.add_option( '-s', '--symlinks', action = 'store_true', dest = 'followlinks',
		help = 'follow symlinks' )
	parser.add_option( '-H', '--hardlinks', action = 'store_true', dest = 'considerhardlinks',
		help = 'normally, when two or more files point to the same disk area they are treated as non-duplicates; this option will change this behavior' )
	parser.add_option( '-n', '--noempty', action = 'store_true', dest = 'excludeempty',
		help = 'exclude zero-length files from consideration' )
	parser.add_option( '-f', '--omitfirst', action = 'store_true', dest = 'omitfirst',
		help = 'omit the first file in each set of matches' )
	parser.add_option( '-1', '--sameline', action = 'store_true', dest = 'dsameline',
		help = 'list each set of matches on a single line' )
	parser.add_option( '-S', '--size', action = 'store_true', dest = 'showsize',
		help = 'show size of duplicate files' )
	parser.add_option( '-m', '--summarize', action = 'store_const', dest = 'action', const = summarizematches,
		help = 'summarize dupe information' )
	parser.add_option( '-M', '--diskusage', action = 'store_const', dest = 'action', const = summarizediskusage,
		help = 'summarize disk usage information' )
	parser.add_option( '-q', '--quiet', action = 'store_true', dest = 'hideprogress',
		help = 'hide progress indicator' )
	parser.add_option( '-d', '--delete', action = 'store_const', dest = 'action', const = deletefiles,
		help = 'prompt user for files to preserve and delete all others; important: under particular circumstances, data may be lost when using this option together with -s or --symlinks, or when specifying a particular directory more than once; refer to the fdupes documentation for additional information' )
	parser.add_option( '-l', '--relink', action = 'store_const', dest = 'action', const = relinkfiles,
		help = 'replace each duplicate with a hard link to another copy in its set' )
	parser.add_option( '-N', '--noprompt', action = 'store_true', dest = 'noprompt',
		help = 'together with --delete, preserve the first file in each set of duplicates and delete the rest without prompting the user' )

	( options, args ) = parser.parse_args()
	# --recurse: is not implemented; keep the attribute so the check
	# below stays valid.
	options.recurseafter = False

	if not args:
		errormsg( 'no directories specified' )
		sys.exit( 1 )

	if options.recurse and options.recurseafter:
		errormsg( 'options --recurse and --recurse: are not compatible' )
		sys.exit( 1 )

	files = grokfiles( args )

	if not files:
		# Bug fix: sys.exit( 0 ) was nested inside the progress-indicator
		# branch, so with --quiet an empty file list fell through into the
		# matching loop below instead of exiting.
		if not options.hideprogress:
			sys.stderr.write( '\r%40s\r' % ' ' )
		sys.exit( 0 )

	if not options.hideprogress:
		filecount = len( files )
		progress = 0
	checktable = {}	# maps file size -> root of a binary tree of filestat nodes
	groups = []	# lists of filestat objects confirmed byte-identical
	for curfile in files:
		match = checkmatch( checktable, curfile )
		# A signature match is confirmed byte-for-byte before grouping.
		if match:
			if confirmmatch( curfile.filename, match.filename ):
				if match.duplicates is None:
					match.duplicates = [match]
					groups.append( match.duplicates )
				match.duplicates.append( curfile )
		if not options.hideprogress:
			sys.stderr.write( '\rProgress [%d/%d] %.0f%% ' %
				( progress, filecount, 100.0 * progress / filecount ) )
			progress += 1
	if not options.hideprogress:
		sys.stderr.write( '\r%40s\r' % ' ' )

	# Oldest file first within each set, so the original copy is kept.
	for duplicates in groups:
		duplicates.sort( cmp = sort_pairs_by_mtime )
	options.action( groups )

Reply to: