mirror of
https://github.com/TheAlgorithms/JavaScript.git
synced 2025-07-06 01:18:23 +08:00
Merge pull request #608 from MarkSFrancis/shuf
Added shuf, a way to get a random sample from a large dataset
This commit is contained in:
@ -75,6 +75,7 @@
|
|||||||
* [MinimumCostPath](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/MinimumCostPath.js)
|
* [MinimumCostPath](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/MinimumCostPath.js)
|
||||||
* [NumberOfSubsetEqualToGivenSum](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/NumberOfSubsetEqualToGivenSum.js)
|
* [NumberOfSubsetEqualToGivenSum](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/NumberOfSubsetEqualToGivenSum.js)
|
||||||
* [SieveOfEratosthenes](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/SieveOfEratosthenes.js)
|
* [SieveOfEratosthenes](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/SieveOfEratosthenes.js)
|
||||||
|
* [Shuf](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/Shuf.js)
|
||||||
* [SudokuSolver](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/SudokuSolver.js)
|
* [SudokuSolver](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/SudokuSolver.js)
|
||||||
* [TrappingRainWater](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/TrappingRainWater.js)
|
* [TrappingRainWater](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/TrappingRainWater.js)
|
||||||
* [ZeroOneKnapsack](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/ZeroOneKnapsack.js)
|
* [ZeroOneKnapsack](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/ZeroOneKnapsack.js)
|
||||||
|
96
Dynamic-Programming/Shuf.js
Normal file
96
Dynamic-Programming/Shuf.js
Normal file
@ -0,0 +1,96 @@
|
|||||||
|
/*
|
||||||
|
Given a data set of an unknown size,
|
||||||
|
Get a random sample in a random order
|
||||||
|
It's used in data analytics, often as a way to get a small random sample from a data lake or warehouse, or from a large CSV file
|
||||||
|
*/
|
||||||
|
function shuf (datasetSource, sampleSize) {
|
||||||
|
const output = fillBaseSample(datasetSource, sampleSize)
|
||||||
|
|
||||||
|
return randomizeOutputFromDataset(datasetSource, output)
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fills the output if possible, with the minimum number of values
|
||||||
|
* @param {Iterable.<T>} datasetSource The iterable source of data
|
||||||
|
* @param {number} sampleSize The size of the sample to extract from the dataset
|
||||||
|
* @returns {Array.<T>} The random sample, as an array
|
||||||
|
* @template T
|
||||||
|
*/
|
||||||
|
function fillBaseSample (datasetSource, sampleSize) {
|
||||||
|
let filledIndexes = []
|
||||||
|
let output = new Array(sampleSize)
|
||||||
|
|
||||||
|
// Spread data out filling the array
|
||||||
|
while (true) {
|
||||||
|
const iterator = datasetSource.next()
|
||||||
|
if (iterator.done) break
|
||||||
|
|
||||||
|
let insertTo = Math.floor(Math.random() * output.length)
|
||||||
|
while (filledIndexes.includes(insertTo)) {
|
||||||
|
insertTo++
|
||||||
|
if (insertTo === output.length) {
|
||||||
|
insertTo = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
output[insertTo] = {
|
||||||
|
value: iterator.value
|
||||||
|
}
|
||||||
|
|
||||||
|
filledIndexes = [...filledIndexes, insertTo]
|
||||||
|
|
||||||
|
if (filledIndexes.length === sampleSize) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (filledIndexes.length < output.length) {
|
||||||
|
// Not a large enough dataset to fill the sample - trim empty values
|
||||||
|
output = output.filter((_, i) => filledIndexes.includes(i))
|
||||||
|
}
|
||||||
|
|
||||||
|
return output.map((o) => o.value)
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Replaces values in the output randomly with new ones from the dataset
|
||||||
|
* @param {Iterable.<T>} datasetSource The iterable source of data
|
||||||
|
* @param {Array.<T>} output The output so far, filled with data
|
||||||
|
* @returns {Array.<T>} The random sample, as an array
|
||||||
|
* @template T
|
||||||
|
*/
|
||||||
|
function randomizeOutputFromDataset (datasetSource, output) {
|
||||||
|
const newOutput = [...output]
|
||||||
|
let readSoFar = output.length
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
const iterator = datasetSource.next()
|
||||||
|
if (iterator.done) break
|
||||||
|
readSoFar++
|
||||||
|
|
||||||
|
const insertTo = Math.floor(Math.random() * readSoFar)
|
||||||
|
if (insertTo < newOutput.length) {
|
||||||
|
newOutput[insertTo] = iterator.value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return newOutput
|
||||||
|
}
|
||||||
|
|
||||||
|
const main = () => {
|
||||||
|
/**
|
||||||
|
* Generates a random range of data, with values between 0 and 2^31 - 1
|
||||||
|
* @param {number} length The number of data items to generate
|
||||||
|
* @returns {Iterable<number>} Random iterable data
|
||||||
|
*/
|
||||||
|
function * generateRandomData (length) {
|
||||||
|
const maxValue = Math.pow(2, 31) - 1
|
||||||
|
for (let i = 0; i < length; i++) {
|
||||||
|
yield Math.floor(Math.random() * maxValue)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const source = generateRandomData(1000)
|
||||||
|
const result = shuf(source, 10)
|
||||||
|
console.log(result)
|
||||||
|
}
|
||||||
|
main()
|
Reference in New Issue
Block a user