#!/usr/bin/env python3
"""Split the entries of a folder into subfolders.

Entries are hardlinked (default) or moved into subfolders created inside the
folder being split.  The target subfolder of each entry is chosen by one of
four methods: ``sequence``, ``sparse``, ``random`` or ``regexp``.
"""
import os, sys
import math, shutil, re
from random import shuffle

VERSION = "0.2"


def _usage_error(parser, message):
    """Print the full help text, then exit through ``parser.error``."""
    parser.print_help()
    parser.error(message)


def setup_options():
    """Parse and validate the command line options.

    Returns:
        The parsed ``argparse.Namespace``.  ``order`` is forced to
        ``"regexp"`` whenever a regular expression is given with ``-r``.

    Invalid combinations terminate the program through ``parser.error``:
    none of -n/-i/-r given, -n together with -i (without -r), a
    non-positive -n or -i, regexp order without -r, or a regular expression
    that does not compile or has no capture group.
    """
    from argparse import ArgumentParser

    parser = ArgumentParser(description="Splits files to subfolders equally.")
    parser.add_argument(
        "--order",
        "-o",
        type=str,
        action="store",
        dest="order",
        default="sequence",
        help="Splitting method.",
        choices=["sequence", "sparse", "regexp", "random"],
    )
    parser.add_argument(
        "-m",
        action="store_true",
        dest="move",
        default=False,
        help="Move entries instead of hardlink.",
    )
    parser.add_argument(
        "-f",
        action="store_true",
        dest="files",
        default=False,
        help="Split files only, skipping folders",
    )
    parser.add_argument(
        "--dry",
        action="store_true",
        dest="dry",
        default=False,
        help="Dry run",
    )
    parser.add_argument(
        "-r",
        "--regexp",
        type=str,
        action="store",
        dest="regexp",
        default="",
        help="Regular expression for splitting. When set, order regexp used, -n or -i not used.",
    )
    parser.add_argument(
        "-n",
        "-N",
        type=int,
        default=None,
        action="store",
        dest="n",
        # The option has no default: one of -n, -i or -r is mandatory.
        help="Number of subfolders to split into.",
    )
    parser.add_argument(
        "-i",
        "-I",
        type=int,
        default=None,
        action="store",
        dest="i",
        help="Max number of files in one folder. Can not be used together with -n or -r",
    )
    parser.add_argument(
        "path",
        type=str,
        action="store",
        default=".",
        nargs="?",
        help="Folder to split.",
    )
    options = parser.parse_args()

    if options.n is None and options.i is None and options.regexp == "":
        _usage_error(parser, "Either -n, -i or -r must be passed")
    if options.regexp != "":
        # A regular expression always selects the regexp method.
        options.order = "regexp"
    elif options.i is not None and options.n is not None:
        _usage_error(parser, "Both -n and -i cannot be used at the same time.")

    # Zero or negative counts would end in a division by zero (or a
    # nonsensical folder count) later on, so reject them up front.
    for flag, value in (("-n", options.n), ("-i", options.i)):
        if value is not None and value < 1:
            _usage_error(parser, "{} must be a positive integer.".format(flag))

    if options.order == "regexp":
        if options.regexp == "":
            _usage_error(parser, "Order regexp requires a regular expression (-r).")
        try:
            pattern = re.compile(options.regexp)
        except re.error as err:
            _usage_error(parser, "Invalid regular expression: {}".format(err))
        # Group 1 of each match names the target subfolder.
        if pattern.groups < 1:
            _usage_error(
                parser, "The regular expression must contain a capture group."
            )
    return options


def linktree(src, dst):
    """Recursively hardlink a directory tree using os.link.

    Modified from shutil.copytree.

    Args:
        src: Existing directory to replicate.
        dst: Directory to create; ``os.makedirs`` raises if it already exists.

    Raises:
        shutil.Error: After the whole tree has been processed, if any entry
            could not be linked.  ``args[0]`` is a list of
            ``(srcname, dstname, reason)`` tuples.
    """
    names = os.listdir(src)
    os.makedirs(dst)
    errors = []
    for name in names:
        srcname = os.path.join(src, name)
        dstname = os.path.join(dst, name)
        try:
            if os.path.isdir(srcname):
                linktree(srcname, dstname)
            else:
                os.link(srcname, dstname)
        except shutil.Error as err:
            # Failures collected by a recursive call: merge them into ours.
            errors.extend(err.args[0])
        except EnvironmentError as why:
            errors.append((srcname, dstname, str(why)))
    if errors:
        raise shutil.Error(errors)


def copyfileorfolder(basename, source, target, move):
    """Place the entry ``basename`` of ``source`` under ``target``.

    Args:
        basename: Name of the file or folder inside ``source``.
        source: Folder currently containing the entry.
        target: Folder that receives the entry under the same name.
        move: When true the entry is moved; otherwise files are hardlinked
            and folders are replicated with ``linktree``.

    Raises:
        RuntimeError: If not moving and the entry is neither a regular file
            nor a directory.
    """
    src = os.path.join(source, basename)
    dst = os.path.join(target, basename)
    if move:
        shutil.move(src, dst)
    elif os.path.isfile(src):
        os.link(src, dst)
    elif os.path.isdir(src):
        linktree(src, dst)
    else:
        raise RuntimeError(src + " was neither file nor folder.")


def portorder(inFiles, inFolder, outFolders, N, link):
    """Distribute entries round-robin over the output folders (sparse).

    Entry k goes to ``outFolders[k % N]``.

    Args:
        inFiles: Entry names inside ``inFolder``.
        inFolder: Folder being split.
        outFolders: Existing target folders, at least ``N`` of them.
        N: Number of target folders to cycle through.
        link: Forwarded as the ``move`` flag of ``copyfileorfolder``; despite
            its name, a true value means *move* rather than hardlink.
    """
    for position, name in enumerate(inFiles):
        copyfileorfolder(name, inFolder, outFolders[position % N], link)


def fileorder(inFiles, inFolder, outFolders, N, link):
    """Distribute entries in contiguous runs, keeping input order (sequence).

    The entries are cut into ``N`` consecutive chunks whose sizes differ by
    at most one; the larger chunks come first.

    Args:
        inFiles: Entry names inside ``inFolder``.
        inFolder: Folder being split.
        outFolders: Existing target folders, at least ``N`` of them.
        N: Number of chunks / target folders.
        link: Forwarded as the ``move`` flag of ``copyfileorfolder``; despite
            its name, a true value means *move* rather than hardlink.
    """
    if not inFiles:
        # Nothing to distribute; also avoids dividing by N == 0 when an
        # empty folder is split by maximum folder size.
        return
    base, extra = divmod(len(inFiles), int(N))
    start = 0
    for outidx in range(N):
        # The first `extra` folders take one additional entry each.
        size = base + (1 if outidx < extra else 0)
        for name in inFiles[start:start + size]:
            copyfileorfolder(name, inFolder, outFolders[outidx], link)
        start += size


def regexorder(inFiles, inFolder, outFolders, matcher, uniqlabel, link):
    """Distribute entries by the first capture group of a regex match.

    Args:
        inFiles: Entry names inside ``inFolder``.
        inFolder: Folder being split.
        outFolders: Target folders, parallel to ``uniqlabel``.
        matcher: Compiled pattern; group 1 of a match selects the folder.
        uniqlabel: Labels as returned by ``regexmatches``.
        link: Forwarded as the ``move`` flag of ``copyfileorfolder``; despite
            its name, a true value means *move* rather than hardlink.

    Entries that do not match, or whose group 1 did not participate in the
    match, are left untouched.
    """
    # Resolve label -> folder index once instead of scanning per entry.
    positions = {}
    for idx, label in enumerate(uniqlabel):
        positions.setdefault(label, idx)
    for name in inFiles:
        m = matcher.search(name)
        if m is None or m.group(1) is None:
            continue
        copyfileorfolder(name, inFolder, outFolders[positions[m.group(1)]], link)


def regexmatches(inFiles, opts):
    """Collect the distinct group-1 labels matched among ``inFiles``.

    Prints the unique labels and the number of non-matching entries.

    Args:
        inFiles: Entry names to test.
        opts: Options providing ``regexp`` (pattern text) and ``path``.

    Returns:
        Tuple ``(outFolders, uniqlabel, matcher)``: one folder path under
        ``opts.path`` per label, the sorted unique labels, and the compiled
        pattern.
    """
    matcher = re.compile(opts.regexp)
    matches = []
    skipped = 0
    for name in inFiles:
        m = matcher.search(name)
        # An optional group that did not participate yields None, which
        # cannot name a folder; count such entries as not matching.
        if m is not None and m.group(1) is not None:
            matches.append(m.group(1))
        else:
            skipped += 1
    uniqlabel = sorted(set(matches))
    print("Unique matches", uniqlabel)
    print("Not matching %d files." % skipped)
    outFolders = [os.path.join(opts.path, label) for label in uniqlabel]
    return (outFolders, uniqlabel, matcher)


def offset(it):
    """Yield the running totals of the numbers in ``it``."""
    total = 0
    for x in it:
        total += x
        yield total


def report(outFolders):
    """Print ``<folder name>:<entry count>`` for every output folder."""
    for folder in outFolders:
        print(os.path.basename(folder) + ":" + str(len(os.listdir(folder))))


def main():
    """Split the entries of the input folder into subfolders."""
    options = setup_options()
    method = options.order.lower().strip()

    # List entries, skipping hidden ones (names starting with a dot).
    inFiles = sorted(
        name for name in os.listdir(options.path) if not name.startswith(".")
    )
    if options.files:
        inFiles = [f for f in inFiles if os.path.isfile(os.path.join(options.path, f))]

    n = None
    if method == "regexp":
        # Folder count comes from the matches; -n / -i are not consulted
        # (and may legitimately be absent) in this mode.
        (outFolders, uniqlabel, matcher) = regexmatches(inFiles, options)
    else:
        if options.n is not None:
            n = options.n
            i = math.ceil(len(inFiles) / n)
        else:
            n = math.ceil(len(inFiles) / options.i)
            i = options.i
        print("Splitting to {} folders, <={} files / folder".format(n, i))
        # Zero-pad the folder numbers so the names sort naturally.
        width = len(str(n))
        outFolders = [
            os.path.join(options.path, "folder-{:0{}d}".format(x + 1, width))
            for x in range(n)
        ]

    if options.dry:
        print("Not doing anything, --dry")
        return

    for folder in outFolders:
        if not os.path.isdir(folder):
            os.mkdir(folder)

    if method == "regexp":
        regexorder(inFiles, options.path, outFolders, matcher, uniqlabel, options.move)
    elif method == "random":
        shuffle(inFiles)
        portorder(inFiles, options.path, outFolders, n, options.move)
    elif method == "sparse":
        portorder(inFiles, options.path, outFolders, n, options.move)
    elif method == "sequence":
        fileorder(inFiles, options.path, outFolders, n, options.move)

    report(outFolders)


if __name__ == "__main__":
    main()