#!/usr/bin/env python3 # 20240515 djb # SPDX-License-Identifier: LicenseRef-PD-hp OR CC0-1.0 OR 0BSD OR MIT-0 OR MIT # Conventional view: # "insert y at position x in array" # takes at most linear #operations, # and usually takes linear #operations. # Consider an "insertion series": # a loop over i of "insert Y[i] at position X[i] in the array". # Usually takes quadratic #operations, right? # The point of the algorithm here: # essentially linear #operations # to apply an insertion series, # even without input-dependent array indexing. # See insertionseries_merge_after_sort_recursive() below. # Simple algorithm. # Could easily be a textbook exercise. # So far searching hasn't found a reference. # Does anyone know a reference? # Algorithm is parallelizable # if one plugs in a parallelizable merging subroutine # and any of the usual parallel prefix-sum algorithms. # Algorithm is vectorizable # if one plugs in vectorizable merging/prefix-sum subroutines. # Good choices of merging subroutines: # fast parallel merging networks (odd-even, bitonic, etc.). # These also naturally avoid input-dependent array indexing. # Algorithm can handle streams as a dynamic data structure. # The s parameter inside the algorithm can be freely selected # as anything between 1 and t-1 # without affecting the output. # Some reasonable choices compatible with parallelism etc.: # Choose s as floor(t/2). # Choose s as ceil(t/2). # Choose s as largest power of 2 with s < t. # Choose s in light of benchmarks of merge etc. # For streams: use s=1, then s=2, then s=4, etc. # Some choices that simplify the algorithm: # Choose s as 1. # Choose s as t-1. # One application of insertion series: # map a sequence of integers to a "constant-weight binary word". # Parameters: nonnegative integers m,t. # Input: t integers X[0],X[1],...,X[t-1] # with 0 <= X[0] <= m, 0 <= X[1] <= m+1, etc. # Output: m+t bits, exactly t being 1, exactly m being 0. # Specifically: output is the result of # starting with m bits, all being 0; # inserting 1 at position X[0]; # inserting 1 at position X[1]; # etc. # Alternatively: do the same for t,m # and exchange 0 with 1. # This is likely faster if t > m. # Variant: constant-weight ternary word. # Generate constant-weight binary word, and # randomly flip signs, so some 1 bits become -1. # Can flip signs as input to insertion series, # or flip signs afterwards. # Another variant: constant number of 1, constant number of -1. # This is another special case of insertion series. # Can flip signs as input to insertion series. # Or just generate binary words # and tweak the last recursive step to flip signs, # choosing s as the target number of 1 positions, # or choosing s as the target number of -1 positions. # Insertion series also generalizes random permutation, random shuffling, etc. # Some applications of constant-weight words: # cryptosystems such as McEliece and NTRU. # Typically want uniform distribution of words. # Uniform distribution of X produces uniform distribution of words. # Typically words are kept secret. # Want to avoid secret-dependent branches and array indices. # The algorithm here easily achieves that. # Typically word will be generated from an "RNG" (or "PRNG", "DRBG", etc.). # Typically use rejection sampling to convert bits to uniform random X. # Typically take larger integers mod range to reduce rejection chance. # Typically replace "mod range" with: multiply by range, then shift. # With large enough integers, rejection chance becomes negligible. # Or can simply skip rejection, accepting non-uniform distribution. # Having X close to uniform produces word close to uniform. # Easy to prove bounds on divergence: # see, e.g., https://cr.yp.to/papers.html#divergence. # This handles "search" applications. # Easy to prove bounds on statistical distance. # This also handles "distinguish" applications. # Typically imposes more stringent closeness requirement. # Typically want words to be generated quickly. # Want to limit amount of RNG output used. # Want to limit amount of computation. # ===== Let's get to the code! # ===== Subroutines # For parallel, vectorizable, etc., # replace merge() with a merging network, # and replace prefixsums() with parallel prefix-sum algorithms. # Can skip last output entry from prefixsums() in context. def merge(L,R): return sorted(L+R) def prefixsums(L): result = [0] for b in L: result += [result[-1]+b] return result # ===== Insertion series # insertionseries API: # input: starting list L (or tuple etc.) # input: list XY of pairs # output: list obtained from L # by inserting y at position x in L # for, successively, each (x,y) in XY # insertionseries_sort API: # input: list XY of pairs # output: list of (x+adjustment,y) # with one entry for each (x,y) from XY # where x+adjustment is where insertionseries ends up putting y # insertionseries implementations: # insertionseries_ref # insertionseries_reduce_to_case_of_empty_list # insertionseries_linearscan_after_sort_ref # insertionseries_merge_after_sort_ref # insertionseries_merge_after_sort_recursive # insertionseries_sort implementations: # insertionseries_sort_ref # insertionseries_sort_recursive # helpers: # insertionseries_sort_merge def insertionseries_ref(L,XY): L = list(L) for x,y in XY: assert 0 <= x and x <= len(L) L = L[:x]+[y]+L[x:] return L def insertionseries_reduce_to_case_of_empty_list(L,XY): XY = list(enumerate(L))+list(XY) return insertionseries_ref([],XY) def insertionseries_sort_ref(XY): L = [] for x,y in XY: L = ( [(u,v) for u,v in L if u= len(result) m -= x-len(result) result += [0]*(x-len(result)) result += [1] assert m >= 0 result += [0]*m return result def cww_sort_mergepos(L,R): L = [(x,1) for x in L] R = [(x-j,0) for j,x in enumerate(R)] M = merge(L,R) offsets = prefixsums(1-fromL for _,fromL in M) return [x+offset for (x,_),offset in zip(M,offsets)] def cww_sort_mergebits(L,R): L = [(x,1) for x in L] R = [(x-j,0) for j,x in enumerate(R)] M = merge(L,R) return [1-fromL for _,fromL in M] def cww_merge_after_sort_ref(m,X): return cww_sort_mergebits(range(m),cww_sort_ref(X)) def cww_sort_recursive(X): X = list(X) t = len(X) if t <= 1: return X s = t//2 L = cww_sort_recursive(X[:s]) R = cww_sort_recursive(X[s:]) return cww_sort_mergepos(L,R) def cww_merge_after_sort_recursive(m,X): return cww_sort_mergebits(range(m),cww_sort_recursive(X)) cww_sort = cww_sort_recursive cww = cww_merge_after_sort_recursive def cww_test(): import sys from random import randrange maxloops = 10000 for t in range(10): for m in range(10): Xpossibilities = 1 for j in range(t): Xpossibilities *= m+t-j tfactorial = 1 for j in range(t): tfactorial *= t-j allinputs = Xpossibilities <= maxloops loops = min(Xpossibilities,maxloops) print(f'cww m {m} t {t} loops {loops} Xpossibilities {Xpossibilities} allinputs {allinputs}') sys.stdout.flush() results = {} for loop in range(loops): r = loop if allinputs else randrange(Xpossibilities) X = [] for j in range(t): X += [r%(m+j+1)] r //= m+j+1 S = cww_sort_ref(X) assert S == cww_sort_via_insertionseries_sort(X) assert S == cww_sort_recursive(X) assert S == cww_sort(X) assert len(S) == t assert S == sorted(set(S)) if t > 0: assert S[0] >= 0 assert S[t-1] < m+t W = cww_ref(m,X) assert W == cww_via_insertionseries(m,X) assert W == cww_linearscan_after_sort_ref(m,X) assert W == cww_merge_after_sort_ref(m,X) assert W == cww_merge_after_sort_recursive(m,X) assert W == cww(m,X) assert len(W) == m+t assert all(y in (0,1) for y in W) assert sum(W) == t assert S == [x for x,y in enumerate(W) if y == 1] if allinputs: S = tuple(S) if S not in results: results[S] = 0 results[S] += 1 if allinputs: assert len(results)*tfactorial == loops for result in results: assert results[result] == tfactorial if __name__ == '__main__': insertionseries_test() cww_test()