Files
q-tools/files/FolderSplit.py

297 lines
8.4 KiB
Python
Executable File

#!/usr/bin/env python3
import math
import os
import re
import shutil
import sys
from datetime import datetime
from random import shuffle
VERSION = "0.3"
def setup_options():
    """Parse and validate the command line options.

    Returns the argparse namespace with these adjustments applied:

    * a non-empty ``--regexp`` forces ``order`` to ``"regexp"``;
    * a given ``--datefmt`` forces ``order`` to ``"date"``;
    * for the ``"date"`` order, ``datefmt`` falls back to ``%Y-%m-%d``.

    For the ``sequence``, ``sparse`` and ``random`` orders exactly one of
    ``-n`` / ``-i`` must be given, and it must be a positive integer.
    Invalid combinations end the program through ``parser.error``
    (which raises ``SystemExit``).
    """
    from argparse import ArgumentParser

    parser = ArgumentParser(description="Splits files to subfolders equally.")
    parser.add_argument(
        "--order",
        "-o",
        type=str,
        action="store",
        dest="order",
        default="sequence",
        help="Splitting method.",
        choices=["sequence", "sparse", "regexp", "random", "date"],
    )
    parser.add_argument(
        "-m",
        action="store_true",
        dest="move",
        default=False,
        help="Move entries instead of hardlink.",
    )
    parser.add_argument(
        "-f",
        action="store_true",
        dest="files",
        default=False,
        help="Split files only, skipping folders",
    )
    parser.add_argument(
        "--dry",
        action="store_true",
        dest="dry",
        default=False,
        help="Dry run",
    )
    parser.add_argument(
        "-r",
        "--regexp",
        type=str,
        action="store",
        dest="regexp",
        default="",
        help="Regular expression for splitting. When set, order regexp used, -n or -i not used.",
    )
    parser.add_argument(
        "-d",
        "--datefmt",
        type=str,
        action="store",
        dest="datefmt",
        default=None,
        help="Date format for 'date' split. Defaults to %%Y-%%m-%%d",
    )
    parser.add_argument(
        "-n",
        "-N",
        type=int,
        default=None,
        action="store",
        dest="n",
        # There is no implicit default: one of -n / -i is mandatory for the
        # count-based orders, so the help text must not promise one.
        help="Number of subfolders to split into. Can not be used together with -i",
    )
    parser.add_argument(
        "-i",
        "-I",
        type=int,
        default=None,
        action="store",
        dest="i",
        help="Max number of files in one folder. Can not be used together with -n or -r",
    )
    parser.add_argument(
        "path",
        type=str,
        action="store",
        default=".",
        nargs="?",
        help="Folder to split. Defaults to current folder.",
    )
    options = parser.parse_args()

    # Giving a regexp or a date format implies the matching split method.
    if options.regexp != "":
        options.order = "regexp"
    if options.datefmt is not None:
        options.order = "date"

    if options.order in ("sequence", "sparse", "random"):
        if options.n is None and options.i is None:
            parser.print_help()
            parser.error("Either -n or -i must be used")
        if options.i is not None and options.n is not None:
            parser.print_help()
            parser.error("Both -n and -i cannot be used at the same time.")
        # Zero or negative counts would later cause a division by zero or
        # an empty folder list, so reject them up front.
        if options.n is not None and options.n < 1:
            parser.print_help()
            parser.error("-n must be a positive integer.")
        if options.i is not None and options.i < 1:
            parser.print_help()
            parser.error("-i must be a positive integer.")
    if options.order == "regexp":
        if options.regexp == "":
            parser.print_help()
            parser.error("-r must be used")
    if options.order == "date":
        if options.datefmt is None:
            options.datefmt = "%Y-%m-%d"
    return options
def linktree(src, dst):
    """Recursively hard-link a directory tree using os.link.

    Modified from shutil.copytree: the directory ``dst`` is created (it
    must not exist yet), sub-directories are recreated recursively and
    every other entry is hard-linked with ``os.link``.

    Failures on individual entries do not stop the walk. They are
    collected as ``(srcname, dstname, message)`` tuples and raised
    together as one ``shutil.Error`` once the directory is processed.
    """
    names = os.listdir(src)
    os.makedirs(dst)
    errors = []
    for name in names:
        srcname = os.path.join(src, name)
        dstname = os.path.join(dst, name)
        try:
            if os.path.isdir(srcname):
                linktree(srcname, dstname)
            else:
                os.link(srcname, dstname)
        except shutil.Error as err:
            # A nested linktree call already collected a list of failures;
            # merge it so the caller sees one flat list.
            errors.extend(err.args[0])
        except OSError as why:
            errors.append((srcname, dstname, str(why)))
    if errors:
        raise shutil.Error(errors)
def copyfileorfolder(basename, source, target, move):
    """Place the entry ``basename`` from folder ``source`` into folder ``target``.

    If the target already contains an entry with that name, a notice is
    printed and nothing is done. Otherwise the entry is moved with
    ``shutil.move`` when ``move`` is true, hard-linked when it is a
    regular file, or link-copied recursively with ``linktree`` when it is
    a directory.

    Raises RuntimeError when the source entry is neither a file nor a
    folder (for example when it does not exist).
    """
    src_path = os.path.join(source, basename)
    dst_path = os.path.join(target, basename)
    if os.path.exists(dst_path):
        print("FileExists!: " + dst_path)
        return
    if move:
        shutil.move(src_path, dst_path)
        return
    if os.path.isfile(src_path):
        os.link(src_path, dst_path)
        return
    if os.path.isdir(src_path):
        linktree(src_path, dst_path)
        return
    # Report the offending entry itself, not just the folder it lives in.
    raise RuntimeError(src_path + " was neither file nor folder.")
def portorder(inFiles, inFolder, outFolders, N, link):
    """Deal the entries out round-robin over the first N output folders (sparse).

    Entry 0 goes to folder 0, entry 1 to folder 1, and so on; after the
    N-th folder the cycle restarts at folder 0. ``link`` is handed on
    unchanged as the last argument of ``copyfileorfolder``.
    """
    slot = 0
    for entry in inFiles:
        copyfileorfolder(entry, inFolder, outFolders[slot], link)
        # Advance to the next folder, wrapping once the N-th one was used.
        slot = slot + 1 if slot + 1 < N else 0
def fileorder(inFiles, inFolder, outFolders, N, link):
    """Distribute the entries in consecutive runs over N folders (sequence).

    The input order is kept: the first run goes to ``outFolders[0]``, the
    next to ``outFolders[1]`` and so on. Run sizes differ by at most one;
    when the entries do not divide evenly the leading folders receive the
    extra entry. ``link`` is handed on unchanged as the last argument of
    ``copyfileorfolder``.

    Nothing is done for an empty ``inFiles``. Raises ValueError when there
    are entries to place but ``N`` is smaller than 1.
    """
    if not inFiles:
        # Nothing to place; also covers N == 0, which would otherwise
        # divide by zero below.
        return
    N = int(N)
    if N < 1:
        raise ValueError("N must be at least 1 to distribute files")
    base, extra = divmod(len(inFiles), N)
    start = 0
    for outidx in range(N):
        # The first `extra` folders take one entry more than the rest.
        size = base + 1 if outidx < extra else base
        for name in inFiles[start:start + size]:
            copyfileorfolder(name, inFolder, outFolders[outidx], link)
        start += size
def regexorder(inFiles, inFolder, outFolders, matcher, uniqlabel, link):
    """Place each entry in the folder that belongs to its regex label.

    The label is capture group 1 of ``matcher.search(name)``; its position
    in ``uniqlabel`` selects the folder in ``outFolders``. Entries the
    pattern does not match are left alone. ``link`` is handed on unchanged
    as the last argument of ``copyfileorfolder``.
    """
    for name in inFiles:
        found = matcher.search(name)
        if not found:
            continue
        folder_pos = uniqlabel.index(found.group(1))
        copyfileorfolder(name, inFolder, outFolders[folder_pos], link)
def dateorder(inFiles, inFolder, outFolders, move):
    """Place each entry in the folder paired with it by position.

    ``outFolders`` is expected to hold one target folder per entry of
    ``inFiles``; pairing stops at the shorter of the two sequences.
    ``move`` is handed on unchanged to ``copyfileorfolder``.
    """
    for entry, destination in zip(inFiles, outFolders):
        copyfileorfolder(entry, inFolder, destination, move)
def regexmatches(inFiles, opts):
    """Collect the regex labels found in the entry names.

    ``opts.regexp`` is compiled and searched in every name; capture group 1
    of each hit is its label. Prints the sorted unique labels and the
    number of names that did not match.

    Returns a tuple ``(outFolders, uniqlabel, matcher)``: one folder path
    under ``opts.path`` per unique label, the sorted unique labels, and the
    compiled pattern.
    """
    matcher = re.compile(opts.regexp)
    hits = [matcher.search(name) for name in inFiles]
    labels = [hit.group(1) for hit in hits if hit]
    unmatched = len(hits) - len(labels)
    uniqlabel = sorted(set(labels))
    print("Unique matches", uniqlabel)
    print("Not matching %d files." % unmatched)
    outFolders = [os.path.join(opts.path, label) for label in uniqlabel]
    return (outFolders, uniqlabel, matcher)
def datematches(inFiles, opts):
    """Build the date-named target folder for every entry.

    Each entry's modification time is formatted with ``opts.datefmt``; the
    result names a folder under ``opts.path``. Prints how many distinct
    date labels occurred.

    Returns a list with one folder path per entry, in input order, so the
    same path appears once for every entry sharing that date label.
    """

    def date_label(name):
        mtime = os.path.getmtime(os.path.join(opts.path, name))
        return datetime.fromtimestamp(mtime).strftime(opts.datefmt)

    labels = [date_label(name) for name in inFiles]
    outFolders = [os.path.join(opts.path, label) for label in labels]
    print("Unique dates", len(set(labels)))
    return outFolders
def offset(it):
    """Yield the running sum of ``it``, one value per input element."""
    running = 0
    for step in it:
        running = running + step
        yield running
def report(outFolders):
    """Print ``<folder name>:<entry count>`` for every folder given."""
    for folder in outFolders:
        count = len(os.listdir(folder))
        print(":".join((os.path.basename(folder), str(count))))
def main():
    """Split the entries of a folder into subfolders.

    Reads the options, lists the non-hidden entries of the input folder
    (optionally regular files only), works out the target folders for the
    chosen method, creates them and distributes the entries. With ``--dry``
    only the plan is printed. Finally the entry count of every target
    folder is reported.
    """
    options = setup_options()
    method = options.order.lower().strip()
    # List entries, skipping hidden ones (names starting with a dot).
    inFiles = sorted(
        name for name in os.listdir(options.path) if not name.startswith(".")
    )
    if options.files:
        inFiles = [f for f in inFiles if os.path.isfile(os.path.join(options.path, f))]

    if method == "regexp":
        (outFolders, uniqlabel, matcher) = regexmatches(inFiles, options)
    elif method == "date":
        outFolders = datematches(inFiles, options)
    else:
        # Compare against None: a literal 0 must not be mistaken for
        # "option not given", which used to end in a division by None.
        if options.n is not None:
            if options.n < 1:
                sys.exit("Number of subfolders (-n) must be at least 1.")
            n = options.n
            i = math.ceil(len(inFiles) / n)
        else:
            if options.i is None or options.i < 1:
                sys.exit("Max number of files per folder (-i) must be at least 1.")
            n = math.ceil(len(inFiles) / options.i)
            i = options.i
        print("Splitting to {} folders, <={} files / folder".format(n, i))
        padding = "{:0" + str(len(str(n))) + "d}"
        outFolders = [
            os.path.join(options.path, ("folder-" + padding).format(x + 1))
            for x in range(n)
        ]

    if options.dry:
        print("Not doing anything, --dry")
        return

    # datematches returns one target per input entry, so the same folder can
    # occur many times. Create and report each folder once, in first-seen order.
    targets = list(dict.fromkeys(outFolders))
    for folder in targets:
        if not os.path.isdir(folder):
            os.mkdir(folder)

    # With nothing to distribute every method is a no-op; skipping them also
    # avoids calling the count-based splitters with zero folders.
    if inFiles:
        if method == "regexp":
            regexorder(inFiles, options.path, outFolders, matcher, uniqlabel, options.move)
        elif method == "random":
            shuffle(inFiles)
            portorder(inFiles, options.path, outFolders, n, options.move)
        elif method == "sparse":
            portorder(inFiles, options.path, outFolders, n, options.move)
        elif method == "sequence":
            fileorder(inFiles, options.path, outFolders, n, options.move)
        elif method == "date":
            dateorder(inFiles, options.path, outFolders, options.move)
    report(targets)
if __name__ == "__main__":
main()