adding first files

This commit is contained in:
q
2012-11-13 09:02:16 +02:00
commit e7f6ee6fc5
2 changed files with 505 additions and 0 deletions

209
file_list.py Executable file
View File

@@ -0,0 +1,209 @@
#!/usr/bin/python
import sys
import os
import re
import sqlite3
import subprocess
import hashlib
import magic
from argparse import ArgumentParser
# Default name of the sqlite database file that stores the file list.
SQLFILE = 'list_of_files.sqlite'
# Case-insensitive pattern for names ending in .jpg, .jpeg or .png.
# Written as a raw string so the "\." escapes reach the regex engine
# unchanged instead of depending on Python passing unknown string escapes
# through (which newer interpreters warn about).
IMGMATCH = re.compile(r'.*\.jpg$|.*\.jpeg$|.*\.png$', re.I)
# Directory names to skip while walking; setup_options() appends the
# values of every -x option to this list.
BADDIRS = []
# NOTE(review): not referenced anywhere in the visible part of this file.
MINSIZE = 0
# libmagic handle; add_single() stores the result of MIME.file(path) in the
# "mime" column.
# NOTE(review): magic.MAGIC_MIME was left here as a commented-out
# alternative flag - presumably it yields MIME types instead; confirm.
MIME = magic.open(magic.MAGIC_NONE)
MIME.load()
def setup_options():
    """Parse the command line and return the resulting options object.

    Besides parsing, this has two side effects that callers rely on:
    every folder given with ``-x`` is appended to the module-level
    ``BADDIRS`` list, and ``--dup`` inverts the value of ``options.add``.
    """
    cli = ArgumentParser(
        description="Maintains the list of images sqlite file")

    # Simple on/off switches: (flag, action, dest, default, help).
    # Kept in the original registration order so --help output is unchanged.
    switches = (
        ("-a", "store_false", "add", True,
         "Do not add new files [%(default)s]"),
        ("-c", "store_true", "changed", False,
         "Modify changed files [%(default)s]"),
        ("-d", "store_true", "delete", False,
         "Delete non-existing entries [%(default)s]"),
        ("-l", "store_true", "symlinks", False,
         "Follow symbolic links [%(default)s]"),
        ("--dup", "store_true", "duplicate", False,
         "Return a list of duplicate files, based on hashes. This option will flip the 'Add new files' option. [%(default)s]"),
    )
    for flag, action, dest, default, text in switches:
        cli.add_argument(flag, action=action, dest=dest, default=default,
                         help=text)

    cli.add_argument(
        "-x", action="append", dest="exclude", default=[],
        help="Exclude folder name from the lists. This option may be issued several times")
    cli.add_argument(
        "-f", action="store", dest="sqlfile", default=SQLFILE,
        help="SQL file name to use [%(default)s]")
    cli.add_argument(
        "-s", type=str, dest="search", default=False,
        help="Search list based on path pattern")
    cli.add_argument('startpath', action="store", default='.', nargs='?')

    parsed = cli.parse_args()
    BADDIRS.extend(parsed.exclude)
    if parsed.duplicate:
        # --dup flips whatever -a resolved to.
        parsed.add = not parsed.add
    return parsed
def createdb(fname):
    """Create the sqlite file *fname* with an empty ``list`` table.

    The table has the columns ``id`` (autoincrementing primary key),
    ``file``, ``date``, ``hash``, ``size`` and ``mime``.

    Raises ``sqlite3.OperationalError`` if the table already exists.
    """
    conn = sqlite3.connect(fname)
    try:
        conn.text_factory = str
        conn.execute(
            'CREATE TABLE list (id INTEGER PRIMARY KEY AUTOINCREMENT, '
            'file TEXT, date INTEGER, hash TEXT, '
            'size INTEGER, mime TEXT)')
        conn.commit()
    finally:
        # Always release the database handle, even when the CREATE fails.
        conn.close()
    return
def delete_nonexisting(sqlfile, options):
    """Remove rows from ``list`` whose file is no longer wanted.

    A row is removed when its path does not exist any more, or when
    ``options.symlinks`` is false and the path is a symbolic link.
    Each removed path is printed.

    sqlfile -- path of the sqlite database to clean up
    options -- object with a boolean ``symlinks`` attribute
    """
    conn = sqlite3.connect(sqlfile)
    try:
        conn.text_factory = str
        cursor = conn.cursor()
        cursor.execute('SELECT file FROM list')
        # Read every path up front so the DELETEs below never modify the
        # table while a SELECT on it is still being iterated.
        listed = [row[0] for row in cursor.fetchall()]
        for path in listed:
            stale = (not os.path.exists(path)
                     or (not options.symlinks and os.path.islink(path)))
            if stale:
                print('removing.. ' + path)
                cursor.execute("DELETE FROM list where file == ?", (path,))
        conn.commit()
    finally:
        # Release the database handle whatever happened above.
        conn.close()
    return
def add_recurse(options):
    """Walk ``options.startpath`` and add or refresh database entries.

    For every file found:
      * if it is not listed yet and ``options.add`` is set, it is added;
      * if it is listed, ``options.changed`` is set and its modification
        time differs from the stored one, its row is updated.

    Symbolic links are followed or filtered out according to
    ``options.symlinks``. The database file itself is never indexed.
    """
    # Absolute location of the database, so it is also skipped when -f was
    # given as a path and not only as a bare file name.
    sqlfile_abs = os.path.abspath(options.sqlfile)
    conn = sqlite3.connect(options.sqlfile)
    try:
        conn.text_factory = str
        db = conn.cursor()
        for path, dirs, files in os.walk(options.startpath,
                                         followlinks=options.symlinks):
            # clean_dirs() removes names from the very list os.walk handed
            # out, so the excluded directories are not descended into.
            dirs = clean_dirs(dirs)
            if not options.symlinks:
                files = clean_syms(files, path)
            for name in files:
                filename = os.path.abspath(os.path.join(path, name))
                # The bare-name comparison is the historical check; the
                # absolute-path comparison covers "-f some/dir/db.sqlite".
                if name == options.sqlfile or filename == sqlfile_abs:
                    continue
                if not is_listed(db, filename):
                    if options.add:
                        add_single(conn, filename, change=False)
                elif options.changed:
                    ftime = os.path.getmtime(filename)
                    if not ftime_match(db, filename, ftime):
                        # Modification time differs: treat as changed.
                        add_single(conn, filename, change=True)
    finally:
        # Release the database handle even if indexing a file failed.
        conn.close()
    return
def add_single(conn, filename, change=False, hash=None, minsize=0):
    """Insert or update the ``list`` row describing *filename*.

    conn     -- open sqlite3 connection; the change is committed on it
    filename -- path of the file to record (also printed to stdout)
    change   -- when true UPDATE the existing row, otherwise INSERT a new one
    hash     -- precomputed content hash; computed with get_md5() when None
    minsize  -- accepted for compatibility; not used by this function

    The stored values are the modification time, the hash, the size in
    bytes and the description returned by ``MIME.file``.
    """
    db = conn.cursor()
    # Parenthesised single-argument form works as a statement on Python 2
    # and as a function call on Python 3.
    print("%(f)s" % {'f': filename})
    if hash is None:
        hash = get_md5(filename)
    ftime = os.path.getmtime(filename)
    fsize = os.path.getsize(filename)
    mime = MIME.file(filename)
    if change:
        db.execute(
            "UPDATE list SET date=?, hash=?, size=?, mime=? WHERE file=?",
            (ftime, hash, fsize, mime, filename))
    else:
        db.execute(
            "INSERT INTO list(file,date,hash,size,mime) VALUES(?,?,?,?,?)",
            (filename, ftime, hash, fsize, mime))
    conn.commit()
    return
def is_listed(db, filename):
    """Tell whether *filename* already has at least one row in ``list``.

    db -- sqlite3 cursor used to run the query
    """
    db.execute("SELECT COUNT(*) FROM list where file == ?", (filename,))
    # COUNT(*) always yields exactly one row with one column.
    (matches,) = db.fetchone()
    return matches > 0
def ftime_match(db, filename, ftime):
    """Tell whether the stored date of *filename* equals *ftime*.

    db -- sqlite3 cursor used to run the query

    Returns False when *filename* has no row at all, instead of failing
    with an IndexError on the empty result.
    """
    db.execute("SELECT date FROM list where file == ?", (filename,))
    row = db.fetchone()
    return row is not None and row[0] == ftime
def hash_match(db, filename, hash):
    """Tell whether the stored hash of *filename* equals *hash*.

    db -- sqlite3 cursor used to run the query

    Returns False when *filename* has no row at all, instead of failing
    with an IndexError on the empty result.
    """
    db.execute("SELECT hash FROM list where file == ?", (filename,))
    row = db.fetchone()
    return row is not None and row[0] == hash
def get_md5(filename, max_bytes=1024 * 1024 * 50):
    """Return the hex MD5 digest of the beginning of *filename*.

    Only the first *max_bytes* bytes are hashed (50 MiB by default), so
    files that share that prefix get the same digest.

    The file is read in moderate chunks and closed deterministically,
    rather than loading up to 50 MiB in one call and leaving the handle
    to the garbage collector.
    """
    chunk_size = 1024 * 1024
    digest = hashlib.md5()
    remaining = max_bytes
    with open(filename, 'rb') as handle:
        while remaining > 0:
            chunk = handle.read(min(chunk_size, remaining))
            if not chunk:
                # End of file reached before the cap.
                break
            digest.update(chunk)
            remaining -= len(chunk)
    return digest.hexdigest()
def clean_dirs(dirs):
    """Strip every name listed in ``BADDIRS`` from *dirs* and return it.

    The list is modified in place (and the same object is returned), one
    occurrence per excluded name.
    """
    for excluded in BADDIRS:
        try:
            dirs.remove(excluded)
        except ValueError:
            # This excluded name is not present in the list.
            continue
    return dirs
def clean_syms(files, path):
    """Return a new list with the names from *files* that are not symlinks.

    Each name is checked as an entry of the directory *path*; the input
    order is kept.
    """
    return [name for name in files
            if not os.path.islink(os.path.join(path, name))]
def find_duplicates(sqlfile):
    """Return the groups of files in *sqlfile* that share a hash.

    The result is a list of ``(hash, rows)`` tuples, one per hash that
    occurs more than once. ``rows`` is a list of ``(file, size, date)``
    tuples sorted by file; the groups themselves are sorted by their
    first row.
    """
    conn = sqlite3.connect(sqlfile)
    try:
        conn.text_factory = str
        cursor = conn.cursor()
        cursor.execute(
            "SELECT hash,count(*) FROM list group by hash HAVING count(*) > 1 ")
        repeated = [row[0] for row in cursor.fetchall()]
        duphash = []
        for digest in repeated:
            cursor.execute(
                "SELECT file,size,date FROM list WHERE hash = ?", (digest,))
            flist = sorted(cursor.fetchall(), key=lambda entry: entry[0])
            # Rows with a NULL hash are grouped by the first query but can
            # never satisfy "hash = ?"; skip the empty group so the sort
            # below does not index into an empty list.
            if flist:
                duphash.append((digest, flist))
    finally:
        # Release the database handle whatever happened above.
        conn.close()
    duphash.sort(key=lambda group: group[1][0])
    return duphash
def searchdb(sqlfile, needle):
    """Print every listed file whose path contains *needle*, sorted.

    *needle* is placed between two ``%`` wildcards of an SQL ``LIKE``
    pattern, so ``%`` and ``_`` inside it act as wildcards as well.
    """
    conn = sqlite3.connect(sqlfile)
    try:
        conn.text_factory = str
        cursor = conn.cursor()
        cursor.execute(
            "SELECT file FROM list WHERE file LIKE ? ORDER BY file",
            ('%' + needle + '%',))
        for row in cursor:
            print(row[0])
    finally:
        # Release the database handle whatever happened above.
        conn.close()
def print_structure(files):
    """Print the duplicate groups in *files*, one line per file.

    *files* is a sequence of ``(hash, rows)`` pairs where each row starts
    with ``(file, size, ...)``. Every line has the form
    ``<n>: <size>:<file> `` with ``n`` restarting at 1 for each group.
    """
    for group in files:
        for index, entry in enumerate(group[1], 1):
            # Parenthesised single-argument form works as a statement on
            # Python 2 and as a function call on Python 3.
            print("%(i)d: %(x)d:%(f)s " % {'i': index, 'f': entry[0], 'x': entry[1]})
    return
def main():
    """Run the command-line tool and exit with status 0.

    Steps, in order: parse options, create the database file if it does
    not exist, then either run a search (and exit), or perform the
    requested delete / add-or-update / duplicate-listing passes.
    """
    options = setup_options()
    if not os.path.exists(options.sqlfile):
        createdb(options.sqlfile)
    if options.search:
        # A search is exclusive: nothing else runs afterwards.
        searchdb(options.sqlfile, options.search)
        sys.exit(0)
    if options.delete:
        print('Deleting entries...')
        delete_nonexisting(options.sqlfile, options)
    if options.add or options.changed:
        print('Adding ' + options.startpath + ' entries...')
        add_recurse(options)
    if options.duplicate:
        files = find_duplicates(options.sqlfile)
        print_structure(files)
    sys.exit(0)


# Only run when executed as a script, so importing this module for its
# helpers does not parse sys.argv or touch the database.
if __name__ == "__main__":
    main()