/* Given a data set of an unknown size, Get a random sample in a random order It's used in data analytics, often as a way to get a small random sample from a data lake or warehouse, or from a large CSV file */ function shuf(datasetSource, sampleSize) { const output = fillBaseSample(datasetSource, sampleSize) return randomizeOutputFromDataset(datasetSource, output) } /** * Fills the output if possible, with the minimum number of values * @param {Iterable.} datasetSource The iterable source of data * @param {number} sampleSize The size of the sample to extract from the dataset * @returns {Array.} The random sample, as an array * @template T */ function fillBaseSample(datasetSource, sampleSize) { let filledIndexes = [] let output = new Array(sampleSize) // Spread data out filling the array while (true) { const iterator = datasetSource.next() if (iterator.done) break let insertTo = Math.floor(Math.random() * output.length) while (filledIndexes.includes(insertTo)) { insertTo++ if (insertTo === output.length) { insertTo = 0 } } output[insertTo] = { value: iterator.value } filledIndexes = [...filledIndexes, insertTo] if (filledIndexes.length === sampleSize) { break } } if (filledIndexes.length < output.length) { // Not a large enough dataset to fill the sample - trim empty values output = output.filter((_, i) => filledIndexes.includes(i)) } return output.map((o) => o.value) } /** * Replaces values in the output randomly with new ones from the dataset * @param {Iterable.} datasetSource The iterable source of data * @param {Array.} output The output so far, filled with data * @returns {Array.} The random sample, as an array * @template T */ function randomizeOutputFromDataset(datasetSource, output) { const newOutput = [...output] let readSoFar = output.length while (true) { const iterator = datasetSource.next() if (iterator.done) break readSoFar++ const insertTo = Math.floor(Math.random() * readSoFar) if (insertTo < newOutput.length) { newOutput[insertTo] = iterator.value } } return newOutput } // Example /** * Generates a random range of data, with values between 0 and 2^31 - 1 * @param {number} length The number of data items to generate * @returns {Iterable} Random iterable data */ function* generateRandomData(length) { const maxValue = Math.pow(2, 31) - 1 for (let i = 0; i < length; i++) { yield Math.floor(Math.random() * maxValue) } } // const source = generateRandomData(1000) // const result = shuf(source, 10) export { shuf, generateRandomData }