Skip to content

Commit 30f0330

Browse files
author
Brian Hulette
committed
Add basic DataFrame impl ...
... and a bunch of performance tests for various scanning approaches
1 parent a1edac2 commit 30f0330

File tree

5 files changed

+262
-29
lines changed

5 files changed

+262
-29
lines changed

js/perf/index.js

+143-28
Original file line numberDiff line numberDiff line change
@@ -16,44 +16,52 @@
1616
// under the License.
1717

1818
// Use the ES5 UMD target as perf baseline
19-
// const { Table, readVectors } = require('../targets/es5/umd');
20-
// const { Table, readVectors } = require('../targets/es5/cjs');
21-
const { Table, readVectors } = require('../targets/es2015/umd');
22-
// const { Table, readVectors } = require('../targets/es2015/cjs');
19+
// const { DataFrame, Table, readVectors } = require('../targets/es5/umd');
20+
// const { DataFrame, Table, readVectors } = require('../targets/es5/cjs');
21+
// const { DataFrame, Table, readVectors } = require('../targets/es2015/umd');
22+
const { DataFrame, Table, readVectors } = require('../targets/es2015/cjs');
2323

2424
const config = require('./config');
2525
const Benchmark = require('benchmark');
2626

2727
const suites = [];
2828

29-
for (let { name, buffers} of config) {
30-
const parseSuite = new Benchmark.Suite(`Parse ${name}`, { async: true });
31-
const sliceSuite = new Benchmark.Suite(`Slice ${name} vectors`, { async: true });
32-
const iterateSuite = new Benchmark.Suite(`Iterate ${name} vectors`, { async: true });
33-
const getByIndexSuite = new Benchmark.Suite(`Get ${name} values by index`, { async: true });
34-
parseSuite.add(createFromTableTest(name, buffers));
35-
parseSuite.add(createReadVectorsTest(name, buffers));
36-
for (const vector of Table.from(buffers).columns) {
37-
sliceSuite.add(createSliceTest(vector));
38-
iterateSuite.add(createIterateTest(vector));
39-
getByIndexSuite.add(createGetByIndexTest(vector));
40-
}
41-
suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite);
42-
}
29+
//for (let { name, buffers} of config) {
30+
// const parseSuite = new Benchmark.Suite(`Parse "${name}"`, { async: true });
31+
// const sliceSuite = new Benchmark.Suite(`Slice "${name}" vectors`, { async: true });
32+
// const iterateSuite = new Benchmark.Suite(`Iterate "${name}" vectors`, { async: true });
33+
// const getByIndexSuite = new Benchmark.Suite(`Get "${name}" values by index`, { async: true });
34+
// parseSuite.add(createFromTableTest(name, buffers));
35+
// parseSuite.add(createReadVectorsTest(name, buffers));
36+
// for (const vector of Table.from(buffers).columns) {
37+
// sliceSuite.add(createSliceTest(vector));
38+
// iterateSuite.add(createIterateTest(vector));
39+
// getByIndexSuite.add(createGetByIndexTest(vector));
40+
// }
41+
// suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite);
42+
//}
4343

4444
for (let {name, buffers, tests} of require('./table_config')) {
45-
const tableIterateSuite = new Benchmark.Suite(`Table Iterate ${name}`, { async: true });
46-
const tableCountBySuite = new Benchmark.Suite(`Table Count By ${name}`, { async: true });
47-
const vectorCountBySuite = new Benchmark.Suite(`Vector Count By ${name}`, { async: true });
45+
const tableIteratorSuite = new Benchmark.Suite(`Table Iterator "${name}"`, { async: true });
46+
const tableCountSuite = new Benchmark.Suite(`Table Count "${name}"`, { async: true });
47+
const dfIteratorSuite = new Benchmark.Suite(`DataFrame Iterator "${name}"`, { async: true });
48+
const dfIteratorCountSuite = new Benchmark.Suite(`DataFrame Iterator Count "${name}"`, { async: true });
49+
const dfDirectCountSuite = new Benchmark.Suite(`DataFrame Direct Count "${name}"`, { async: true });
50+
const dfScanCountSuite = new Benchmark.Suite(`DataFrame Scan Count "${name}"`, { async: true });
51+
const vectorCountSuite = new Benchmark.Suite(`Vector Count "${name}"`, { async: true });
4852
const table = Table.from(buffers);
4953

50-
tableIterateSuite.add(createTableIterateTest(table));
54+
tableIteratorSuite.add(createTableIteratorTest(table));
55+
dfIteratorSuite.add(createDataFrameIteratorTest(table));
5156
for (test of tests) {
52-
tableCountBySuite.add(createTableCountByTest(table, test.col, test.test, test.value))
53-
vectorCountBySuite.add(createVectorCountByTest(table.columns[test.col], test.test, test.value))
57+
tableCountSuite.add(createTableCountTest(table, test.col, test.test, test.value))
58+
dfIteratorCountSuite.add(createDataFrameIteratorCountTest(table, test.col, test.test, test.value))
59+
dfDirectCountSuite.add(createDataFrameDirectCountTest(table, test.col, test.test, test.value))
60+
dfScanCountSuite.add(createDataFrameScanCountTest(table, test.col, test.test, test.value))
61+
vectorCountSuite.add(createVectorCountTest(table.columns[test.col], test.test, test.value))
5462
}
5563

56-
suites.push(tableIterateSuite, tableCountBySuite, vectorCountBySuite)
64+
suites.push(tableIteratorSuite, tableCountSuite, dfIteratorSuite, dfIteratorCountSuite, dfDirectCountSuite, dfScanCountSuite, vectorCountSuite)
5765
}
5866

5967
console.log('Running apache-arrow performance tests...\n');
@@ -125,7 +133,7 @@ function createGetByIndexTest(vector) {
125133
};
126134
}
127135

128-
function createVectorCountByTest(vector, test, value) {
136+
function createVectorCountTest(vector, test, value) {
129137
let op;
130138
if (test == 'gteq') {
131139
op = function () {
@@ -152,7 +160,7 @@ function createVectorCountByTest(vector, test, value) {
152160
};
153161
}
154162

155-
function createTableIterateTest(table) {
163+
function createTableIteratorTest(table) {
156164
let row;
157165
return {
158166
async: true,
@@ -161,7 +169,7 @@ function createTableIterateTest(table) {
161169
};
162170
}
163171

164-
function createTableCountByTest(table, column, test, value) {
172+
function createTableCountTest(table, column, test, value) {
165173
let op;
166174
if (test == 'gteq') {
167175
op = function () {
@@ -187,3 +195,110 @@ function createTableCountByTest(table, column, test, value) {
187195
fn: op
188196
};
189197
}
198+
199+
/**
 * Builds a Benchmark.js test that times one full pass over the row-index
 * iterator of a DataFrame built from `table`.
 *
 * @param table Source passed to `DataFrame.from`; its `length` is reported
 *              in the benchmark name.
 * @returns Benchmark options object (`async`, `name`, `fn`).
 */
function createDataFrameIteratorTest(table) {
    const df = DataFrame.from(table);
    let idx;
    return {
        async: true,
        name: `length: ${table.length}`,
        // Iterate the DataFrame, not the source table: iterating `table`
        // would only repeat the Table iterator benchmark under a new name.
        fn() { for (idx of df) {} }
    };
}
208+
209+
/**
 * Builds a Benchmark.js test that counts matching rows by walking the
 * DataFrame's batches by hand (`df.lengths` + `df.getBatch`) instead of going
 * through its iterator or `scan`.
 *
 * @param table  Source passed to `DataFrame.from`; also used for the name.
 * @param column Index of the column to test.
 * @param test   Comparison to run: 'gteq' (>= value) or 'eq' (== value).
 * @param value  Right-hand side of the comparison.
 * @returns Benchmark options object; `fn` returns the number of matches.
 * @throws Error when `test` is neither 'gteq' nor 'eq'.
 */
function createDataFrameDirectCountTest(table, column, test, value) {
    const df = DataFrame.from(table);
    // Declared locally: as implicit globals `op`/`sum` would be shared by
    // every benchmark in the process and throw in strict mode.
    let op;

    if (test === 'gteq') {
        op = function () {
            let sum = 0;
            for (let batch = -1; ++batch < df.lengths.length;) {
                const length = df.lengths[batch];

                // load batches
                const columns = df.getBatch(batch);

                // visit all indices; unary plus turns the boolean into 0/1
                for (let idx = -1; ++idx < length;) {
                    sum += +(columns[column].get(idx) >= value);
                }
            }
            return sum;
        };
    } else if (test === 'eq') {
        op = function () {
            let sum = 0;
            for (let batch = -1; ++batch < df.lengths.length;) {
                const length = df.lengths[batch];

                // load batches
                const columns = df.getBatch(batch);

                // visit all indices; loose equality is kept on purpose so the
                // comparison semantics match the other count benchmarks
                for (let idx = -1; ++idx < length;) {
                    sum += +(columns[column].get(idx) == value);
                }
            }
            return sum;
        };
    } else {
        throw new Error(`Unrecognized test "${test}"`);
    }

    return {
        async: true,
        name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`,
        fn: op
    };
}
252+
253+
/**
 * Builds a Benchmark.js test that counts matching rows through
 * `DataFrame.scan`, which invokes a callback with each row index and the
 * columns that index refers to.
 *
 * @param table  Source passed to `DataFrame.from`; also used for the name.
 * @param column Index of the column to test.
 * @param test   Comparison to run: 'gteq' (>= value) or 'eq' (== value).
 * @param value  Right-hand side of the comparison.
 * @returns Benchmark options object; `fn` returns the number of matches.
 * @throws Error when `test` is neither 'gteq' nor 'eq'.
 */
function createDataFrameScanCountTest(table, column, test, value) {
    const df = DataFrame.from(table);
    // Declared locally: as implicit globals `op`/`sum` would be shared by
    // every benchmark in the process and throw in strict mode.
    let op;

    if (test === 'gteq') {
        op = function () {
            let sum = 0;
            // unary plus turns the boolean into 0/1
            df.scan((idx, cols) => { sum += +(cols[column].get(idx) >= value); });
            return sum;
        };
    } else if (test === 'eq') {
        op = function () {
            let sum = 0;
            // No logging in here: console output inside the timed function
            // would dominate the measurement.
            df.scan((idx, cols) => { sum += +(cols[column].get(idx) == value); });
            return sum;
        };
    } else {
        throw new Error(`Unrecognized test "${test}"`);
    }

    return {
        async: true,
        name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`,
        fn: op
    };
}
277+
278+
/**
 * Builds a Benchmark.js test that counts matching rows by iterating the
 * DataFrame's row indices and reading `df.columns[column]` for each index.
 *
 * `df.columns` is re-read on every step rather than hoisted, because a
 * DataFrame may swap its `columns` while it is being iterated.
 *
 * @param table  Source passed to `DataFrame.from`; also used for the name.
 * @param column Index of the column to test.
 * @param test   Comparison to run: 'gteq' (>= value) or 'eq' (== value).
 * @param value  Right-hand side of the comparison.
 * @returns Benchmark options object; `fn` returns the number of matches.
 * @throws Error when `test` is neither 'gteq' nor 'eq'.
 */
function createDataFrameIteratorCountTest(table, column, test, value) {
    const df = DataFrame.from(table);
    // Declared locally: as implicit globals `op`/`sum`/`idx` would be shared
    // by every benchmark in the process and throw in strict mode.
    let op;

    if (test === 'gteq') {
        op = function () {
            let sum = 0;
            for (const idx of df) {
                // unary plus turns the boolean into 0/1
                sum += +(df.columns[column].get(idx) >= value);
            }
            return sum;
        };
    } else if (test === 'eq') {
        op = function () {
            let sum = 0;
            for (const idx of df) {
                sum += +(df.columns[column].get(idx) == value);
            }
            return sum;
        };
    } else {
        throw new Error(`Unrecognized test "${test}"`);
    }

    return {
        async: true,
        name: `name: '${table.columns[column].name}', length: ${table.length}, type: ${table.columns[column].type}, test: ${test}, value: ${value}`,
        fn: op
    };
}

js/perf/table_config.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ const filenames = glob.sync(path.resolve(__dirname, `../test/data/tables/`, `*.a
2525
tests = [
2626
{col: 0, test: 'gteq', value: 0 },
2727
{col: 1, test: 'gteq', value: 0 },
28-
{col: 2, test: 'eq', value: 'Seattle'},
28+
//{col: 2, test: 'eq', value: 'Seattle'},
2929
]
3030

3131
for (const filename of filenames) {

js/src/Arrow.ts

+6
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,15 @@ import {
4545
TimestampVector,
4646
} from './vector/numeric';
4747

48+
import { DataFrame } from './dataframe/dataframe';
49+
4850
// closure compiler always erases static method names:
4951
// https://github.com/google/closure-compiler/issues/1776
5052
// set them via string indexers to save them from the mangler
5153
Table['from'] = Table.from;
5254
Table['fromAsync'] = Table.fromAsync;
5355
BoolVector['pack'] = BoolVector.pack;
56+
DataFrame['from'] = DataFrame.from;
5457

5558
export { read, readAsync };
5659
export { Table, Vector, StructRow };
@@ -84,6 +87,8 @@ export {
8487
FixedSizeListVector,
8588
};
8689

90+
export { DataFrame } from './dataframe/dataframe';
91+
8792
/* These exports are needed for the closure umd targets */
8893
try {
8994
const Arrow = eval('exports');
@@ -93,6 +98,7 @@ try {
9398
Arrow['readAsync'] = readAsync;
9499
Arrow['Table'] = Table;
95100
Arrow['Vector'] = Vector;
101+
Arrow['DataFrame'] = DataFrame;
96102
Arrow['StructRow'] = StructRow;
97103
Arrow['BoolVector'] = BoolVector;
98104
Arrow['ListVector'] = ListVector;

js/src/dataframe/dataframe.ts

+109
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
import { Vector } from "../vector/vector";
2+
import { StructVector } from "../vector/struct";
3+
import { VirtualVector } from "../vector/virtual";
4+
5+
/**
 * Row-oriented access over a set of equally long column Vectors.
 *
 * Rows can be visited either with `scan`, which calls back with each row
 * index and the columns that index applies to, or by iterating the DataFrame,
 * which yields row indices to be used with `columns`.
 */
export abstract class DataFrame {
    /** Column vectors that the indices yielded by the iterator refer to. */
    public abstract columns: Vector<any>[];
    /** Row count of each batch, indexed like `getBatch`. */
    public abstract readonly lengths: Uint32Array;
    /** Returns the column vectors for batch number `batch`. */
    public abstract getBatch(batch: number): Vector[];
    /** Calls `next(idx, cols)` once for every row. */
    public abstract scan(next: (idx: number, cols: Vector[]) => void): void;
    /** Yields the row indices, to be looked up in `columns`. */
    public abstract [Symbol.iterator](): IterableIterator<number>;

    /**
     * Wraps a Vector in the most suitable DataFrame implementation.
     *
     * Two layouts are turned into a ChunkedDataFrame:
     *   1) a StructVector whose columns are all VirtualVectors
     *   2) a non-empty VirtualVector whose chunks are all StructVectors
     * Any other StructVector becomes a SimpleDataFrame over its columns, and
     * any other Vector becomes a single-column SimpleDataFrame.
     *
     * @param table Vector to wrap.
     * @returns A DataFrame over `table`'s columns.
     */
    static from(table: Vector<any>): DataFrame {
        if (table instanceof StructVector) {
            if (table.columns.every((col) => col instanceof VirtualVector)) {
                // ChunkedDataFrame case (1)
                return new ChunkedDataFrame(table.columns as VirtualVector<any>[]);
            } else {
                return new SimpleDataFrame(table.columns);
            }
        } else if (table instanceof VirtualVector &&
                   // `every` is vacuously true for an empty list, and there
                   // would be no first struct to take the columns from.
                   table.vectors.length > 0 &&
                   table.vectors.every((vec) => vec instanceof StructVector)) {
            const structs = table.vectors as StructVector<any>[];
            const rest: StructVector<any>[] = structs.slice(1);
            // Stitch column `colIdx` of every struct into one VirtualVector.
            const virtuals: VirtualVector<any>[] = structs[0].columns.map((column, colIdx) => {
                return column.concat(...rest.map((struct) => struct.columns[colIdx]));
            }) as VirtualVector<any>[];
            // ChunkedDataFrame case (2)
            return new ChunkedDataFrame(virtuals);
        } else {
            return new SimpleDataFrame([table]);
        }
    }
}
35+
36+
/**
 * DataFrame over columns that are not split into chunks: every row lives in
 * one batch, and `getBatch` always returns the full columns.
 */
class SimpleDataFrame extends DataFrame {
    /** Single-entry array holding the row count of the only batch. */
    readonly lengths: Uint32Array;

    /**
     * @param columns Column vectors; all must have the same length. An empty
     *                list yields a DataFrame with zero rows.
     * @throws Error when the columns differ in length.
     */
    constructor(public columns: Vector<any>[]) {
        super();
        // Guard the empty case: there is no first column to take a length from.
        const rowCount = columns.length > 0 ? columns[0].length : 0;
        if (!columns.every((v) => v.length === rowCount)) {
            throw new Error("Attempted to create a DataFrame with un-aligned vectors");
        }
        // One batch of `rowCount` rows, so walking `lengths` together with
        // `getBatch` visits every row exactly once.
        this.lengths = new Uint32Array([rowCount]);
    }

    /** Returns all columns; there is only one batch, so no index is needed. */
    public getBatch() {
        return this.columns;
    }

    /** Calls `next(idx, columns)` for each row index in ascending order. */
    public scan(next: (idx: number, cols: Vector[]) => void) {
        const length = this.lengths[0];
        for (let idx = -1; ++idx < length;) {
            next(idx, this.columns);
        }
    }

    /** Yields every row index in ascending order. */
    *[Symbol.iterator]() {
        const length = this.lengths[0];
        for (let idx = -1; ++idx < length;) {
            yield idx;
        }
    }
}
62+
63+
/**
 * DataFrame over VirtualVectors that are chunked identically. Batch `b` is
 * made of chunk `b` of every column, and row indices are relative to the
 * start of their batch.
 */
class ChunkedDataFrame extends DataFrame {
    /**
     * Columns that the indices yielded by the iterator refer to. Starts as
     * the full virtual columns and is replaced by the current batch's chunks
     * while the DataFrame is being iterated.
     */
    public columns: Vector<any>[];
    /** Row count of each batch, one entry per chunk. */
    readonly lengths: Uint32Array;

    /**
     * @param virtuals Chunked column vectors; all must report being aligned
     *                 with the first one. An empty list yields a DataFrame
     *                 with no batches.
     * @throws Error when a column is not aligned with the first one.
     */
    constructor(private virtuals: VirtualVector<any>[]) {
        super();
        // Never leave `columns` undefined before the first iteration.
        this.columns = virtuals;
        if (virtuals.length === 0) {
            this.lengths = new Uint32Array(0);
            return;
        }
        const first = virtuals[0];
        if (!virtuals.slice(1).every((v) => v.aligned(first))) {
            throw new Error("Attempted to create a DataFrame with un-aligned vectors");
        }
        // Take the batch sizes from the chunks themselves. Differencing
        // `offsets` over its full length reads one element past the end and
        // produces a spurious trailing batch of length 0.
        this.lengths = new Uint32Array(first.vectors.map((chunk) => chunk.length));
    }

    /** Returns chunk number `batch` of every column. */
    getBatch(batch: number): Vector[] {
        return this.virtuals.map((virt) => virt.vectors[batch]);
    }

    /**
     * Calls `next(idx, cols)` for every row, batch by batch; `idx` restarts
     * at 0 for each batch and `cols` are that batch's chunks.
     */
    scan(next: (idx: number, cols: Vector[]) => void) {
        for (let batch = -1; ++batch < this.lengths.length;) {
            const length = this.lengths[batch];

            // load batches
            const columns = this.getBatch(batch);

            // visit all indices
            for (let idx = -1; ++idx < length;) {
                next(idx, columns);
            }
        }
    }

    /**
     * Yields batch-relative row indices, batch by batch. Before the indices
     * of a batch are yielded, `columns` is switched to that batch's chunks.
     */
    *[Symbol.iterator]() {
        for (let batch = -1; ++batch < this.lengths.length;) {
            const length = this.lengths[batch];

            // load batches
            this.columns = this.getBatch(batch);

            // yield all indices
            for (let idx = -1; ++idx < length;) {
                yield idx;
            }
        }
    }
}

js/src/vector/virtual.ts

+3
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,9 @@ export class VirtualVector<T> implements Vector<T> {
115115
}
116116
return new ArrayType(0);
117117
}
118+
/**
 * Reports whether `other` has exactly the same offsets as this vector:
 * the same number of entries, equal at every position.
 *
 * @param other Vector to compare against.
 * @returns `true` when both offset lists are identical.
 */
aligned(other: VirtualVector<any>): boolean {
    const mine = this.offsets;
    const theirs = other.offsets;
    // Compare lengths first: `every` alone only checks this vector's entries,
    // so a vector with extra trailing offsets would look aligned.
    return mine.length === theirs.length &&
        mine.every((offset, i) => offset === theirs[i]);
}
118121
}
119122

120123
function arraySet<T>(source: T[], target: T[], index: number) {

0 commit comments

Comments
 (0)