diff --git a/Dynamic-Programming/Shuf.js b/Dynamic-Programming/Shuf.js new file mode 100644 index 000000000..14bf66c01 --- /dev/null +++ b/Dynamic-Programming/Shuf.js @@ -0,0 +1,96 @@ +/* +Given a data set of an unknown size, +Get a random sample in a random order +It's used in data analytics, often as a way to get a small random sample from a data lake or warehouse, or from a large CSV file +*/ +function shuf(datasetSource, sampleSize) { + let output = fillBaseSample(datasetSource, sampleSize); + + return randomizeOutputFromDataset(datasetSource, output); +} + +/** + * Fills the output if possible, with the minimum number of values + * @param {Iterable.} datasetSource The iterable source of data + * @param {number} sampleSize The size of the sample to extract from the dataset + * @returns {Array.} The random sample, as an array + * @template T + */ +function fillBaseSample(datasetSource, sampleSize) { + let filledIndexes = []; + let output = new Array(sampleSize); + + // Spread data out filling the array + while (true) { + const iterator = datasetSource.next(); + if (iterator.done) break; + + let insertTo = Math.floor(Math.random() * output.length); + while (filledIndexes.includes(insertTo)) { + insertTo++; + if (insertTo === output.length) { + insertTo = 0; + } + } + output[insertTo] = { + value: iterator.value, + }; + + filledIndexes = [...filledIndexes, insertTo]; + + if (filledIndexes.length === sampleSize) { + break; + } + } + + if (filledIndexes.length < output.length) { + // Not a large enough dataset to fill the sample - trim empty values + output = output.filter((_, i) => filledIndexes.includes(i)); + } + + return output.map((o) => o.value); +} + +/** + * Replaces values in the output randomly with new ones from the dataset + * @param {Iterable.} datasetSource The iterable source of data + * @param {Array.} output The output so far, filled with data + * @returns {Array.} The random sample, as an array + * @template T + */ +function randomizeOutputFromDataset(datasetSource, output) { + const newOutput = [...output]; + let readSoFar = output.length; + + while (true) { + const iterator = datasetSource.next(); + if (iterator.done) break; + readSoFar++; + + const insertTo = Math.floor(Math.random() * readSoFar); + if (insertTo < newOutput.length) { + newOutput[insertTo] = iterator.value; + } + } + + return newOutput; +} + +const main = () => { + /** + * Generates a random range of data, with values between 0 and 2^31 - 1 + * @param {number} length The number of data items to generate + * @returns {Iterable} Random iterable data + */ + function* generateRandomData(length) { + const maxValue = Math.pow(2, 31) - 1; + for (let i = 0; i < length; i++) { + yield Math.floor(Math.random() * maxValue); + } + } + + const source = generateRandomData(1000); + const result = shuf(source, 10); + console.log(result) +} +main()