Skip to content

Commit d2b18d5

Browse files
committed
Merge remote-tracking branch 'ccri/table-scan-perf' into js-cpp-refactor-merge_with-table-scan-perf
2 parents f3f3b86 + e20decd commit d2b18d5

20 files changed

+1513
-379
lines changed

js/gulp/uglify-task.js

+1
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ const reservePublicNames = ((ESKeywords) => function reservePublicNames(target,
9191
`../${src}/table.js`,
9292
`../${src}/vector.js`,
9393
`../${src}/util/int.js`,
94+
`../${src}/recordbatch.js`,
9495
`../${src}/${mainExport}.js`,
9596
];
9697
return publicModulePaths.reduce((keywords, publicModulePath) => [

js/gulp/util.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ const ESKeywords = [
8787
// EventTarget
8888
`addListener`, `removeListener`, `addEventListener`, `removeEventListener`,
8989
// Arrow properties
90-
`low`, `high`, `data`, `index`, `field`, `validity`, `columns`, `fieldNode`, `subarray`,
90+
`low`, `high`, `data`, `index`, `field`, `columns`, 'numCols', 'numRows', `values`, `valueOffsets`, `nullBitmap`, `subarray`
9191
];
9292

9393
function taskName(target, format) {

js/package.json

+7-7
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
"clean": "gulp clean",
1313
"debug": "gulp debug",
1414
"perf": "node ./perf/index.js",
15+
"create:perfdata": "python ./test/data/tables/generate.py ./test/data/tables/tracks.arrow",
1516
"release": "./npm-release.sh",
1617
"clean:all": "run-p clean clean:testdata",
1718
"clean:testdata": "gulp clean:testdata",
@@ -51,16 +52,15 @@
5152
],
5253
"dependencies": {
5354
"@types/text-encoding-utf-8": "1.0.1",
54-
"command-line-args": "5.0.0",
55+
"command-line-args": "5.0.1",
5556
"command-line-usage": "4.1.0",
5657
"flatbuffers": "trxcllnt/flatbuffers-esm",
5758
"json-bignum": "0.0.3",
5859
"text-encoding-utf-8": "^1.0.2",
59-
"ts-node": "4.1.0",
60-
"tslib": "1.8.1"
60+
"tslib": "1.9.0"
6161
},
6262
"devDependencies": {
63-
"@std/esm": "0.19.6",
63+
"@std/esm": "0.19.7",
6464
"@types/flatbuffers": "1.6.5",
6565
"@types/glob": "5.0.34",
6666
"@types/jest": "22.0.1",
@@ -80,11 +80,11 @@
8080
"gulp-transform-js-ast": "1.0.2",
8181
"gulp-typescript": "3.2.3",
8282
"ix": "2.3.4",
83-
"jest": "22.1.2",
83+
"jest": "22.1.3",
8484
"jest-environment-node-debug": "2.0.0",
8585
"json": "9.0.6",
8686
"lerna": "2.7.1",
87-
"lint-staged": "6.0.0",
87+
"lint-staged": "6.0.1",
8888
"merge2": "1.2.1",
8989
"mkdirp": "0.5.1",
9090
"npm-run-all": "4.1.2",
@@ -130,7 +130,7 @@
130130
"lcov"
131131
],
132132
"coveragePathIgnorePatterns": [
133-
"format\\/(File|Message|Schema|Tensor)_generated\\.(js|ts)$",
133+
"fb\\/(File|Message|Schema|Tensor)_generated\\.(js|ts)$",
134134
"test\\/.*\\.(ts|tsx|js)$",
135135
"/node_modules/"
136136
],

js/perf/index.js

+131-30
Original file line numberDiff line numberDiff line change
@@ -16,29 +16,40 @@
1616
// under the License.
1717

1818
// Use the ES2015 CJS target as the perf baseline (swap the active require below to benchmark another target)
19-
// const { Table, readVectors } = require('../targets/es5/umd');
20-
// const { Table, readVectors } = require('../targets/es5/cjs');
21-
const { Table, readVectors } = require('../targets/es2015/umd');
22-
// const { Table, readVectors } = require('../targets/es2015/cjs');
19+
// const { col, Table, read: readBatches } = require('../targets/es5/umd');
20+
// const { col, Table, read: readBatches } = require('../targets/es5/cjs');
21+
// const { col, Table, read: readBatches } = require('../targets/es2015/umd');
22+
const { col, Table, read: readBatches } = require('../targets/es2015/cjs');
2323

24-
const config = require('./config');
2524
const Benchmark = require('benchmark');
2625

2726
const suites = [];
2827

29-
for (let { name, buffers} of config) {
30-
const parseSuite = new Benchmark.Suite(`Parse ${name}`, { async: true });
31-
const sliceSuite = new Benchmark.Suite(`Slice ${name} vectors`, { async: true });
32-
const iterateSuite = new Benchmark.Suite(`Iterate ${name} vectors`, { async: true });
33-
const getByIndexSuite = new Benchmark.Suite(`Get ${name} values by index`, { async: true });
34-
parseSuite.add(createFromTableTest(name, buffers));
35-
parseSuite.add(createReadVectorsTest(name, buffers));
36-
for (const vector of Table.from(buffers).columns) {
37-
sliceSuite.add(createSliceTest(vector));
38-
iterateSuite.add(createIterateTest(vector));
39-
getByIndexSuite.add(createGetByIndexTest(vector));
40-
}
41-
suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite);
28+
for (let { name, buffers } of require('./table_config')) {
29+
const parseSuiteName = `Parse "${name}"`;
30+
const sliceSuiteName = `Slice "${name}" vectors`;
31+
const iterateSuiteName = `Iterate "${name}" vectors`;
32+
const getByIndexSuiteName = `Get "${name}" values by index`;
33+
const sliceToArraySuiteName = `Slice toArray "${name}" vectors`;
34+
suites.push(createTestSuite(parseSuiteName, createFromTableTest(name, buffers)));
35+
suites.push(createTestSuite(parseSuiteName, createReadBatchesTest(name, buffers)));
36+
const table = Table.from(buffers);
37+
suites.push(...table.columns.map((vector, i) => createTestSuite(getByIndexSuiteName, createGetByIndexTest(vector, table.schema.fields[i].name))));
38+
suites.push(...table.columns.map((vector, i) => createTestSuite(iterateSuiteName, createIterateTest(vector, table.schema.fields[i].name))));
39+
suites.push(...table.columns.map((vector, i) => createTestSuite(sliceToArraySuiteName, createSliceToArrayTest(vector, table.schema.fields[i].name))));
40+
suites.push(...table.columns.map((vector, i) => createTestSuite(sliceSuiteName, createSliceTest(vector, table.schema.fields[i].name))));
41+
}
42+
43+
for (let {name, buffers, countBys, counts} of require('./table_config')) {
44+
const table = Table.from(buffers);
45+
46+
const dfCountBySuiteName = `DataFrame Count By "${name}"`;
47+
const dfFilterCountSuiteName = `DataFrame Filter-Scan Count "${name}"`;
48+
const dfDirectCountSuiteName = `DataFrame Direct Count "${name}"`;
49+
50+
suites.push(...countBys.map((countBy) => createTestSuite(dfCountBySuiteName, createDataFrameCountByTest(table, countBy))));
51+
suites.push(...counts.map(({ col, test, value }) => createTestSuite(dfFilterCountSuiteName, createDataFrameFilterCountTest(table, col, test, value))));
52+
suites.push(...counts.map(({ col, test, value }) => createTestSuite(dfDirectCountSuiteName, createDataFrameDirectCountTest(table, col, test, value))));
4253
}
4354

4455
console.log('Running apache-arrow performance tests...\n');
@@ -52,7 +63,7 @@ function run() {
5263
var str = x.toString();
5364
var meanMsPerOp = Math.round(x.stats.mean * 100000)/100;
5465
var sliceOf60FPS = Math.round((meanMsPerOp / (1000/60)) * 100000)/1000;
55-
return `${str} (avg: ${meanMsPerOp}ms, or ${sliceOf60FPS}% of a frame @ 60FPS) ${x.suffix || ''}`;
66+
return `${str}\n avg: ${meanMsPerOp}ms\n ${sliceOf60FPS}% of a frame @ 60FPS ${x.suffix || ''}`;
5667
}).join('\n') + '\n');
5768
if (suites.length > 0) {
5869
setTimeout(run, 1000);
@@ -61,51 +72,141 @@ function run() {
6172
.run({ async: true });
6273
}
6374

75+
/**
 * Wrap a single benchmark test in its own asynchronous Benchmark suite.
 *
 * @param {string} name - Name given to the new suite.
 * @param {Object} test - Test descriptor handed to the suite's `add` method.
 * @returns Whatever `add(test)` returns; callers push it onto `suites`.
 */
function createTestSuite(name, test) {
    const suite = new Benchmark.Suite(name, { async: true });
    return suite.add(test);
}
78+
6479
function createFromTableTest(name, buffers) {
6580
let table;
6681
return {
6782
async: true,
68-
name: `Table.from`,
83+
name: `Table.from\n`,
6984
fn() { table = Table.from(buffers); }
7085
};
7186
}
7287

73-
function createReadVectorsTest(name, buffers) {
74-
let vectors;
88+
function createReadBatchesTest(name, buffers) {
89+
let recordBatch;
7590
return {
7691
async: true,
77-
name: `readVectors`,
78-
fn() { for (vectors of readVectors(buffers)) {} }
92+
name: `readBatches\n`,
93+
fn() { for (recordBatch of readBatches(buffers)) {} }
7994
};
8095
}
8196

82-
function createSliceTest(vector) {
97+
function createSliceTest(vector, name) {
8398
let xs;
8499
return {
85100
async: true,
86-
name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}`,
101+
name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`,
87102
fn() { xs = vector.slice(); }
88103
};
89104
}
90105

91-
function createIterateTest(vector) {
106+
/**
 * Build a benchmark test that slices a vector and converts the slice with
 * `toArray()`.
 *
 * @param {Object} vector - Object exposing `length`, `type` and `slice()`.
 * @param {string} name - Column name shown in the test label.
 * @returns {{async: boolean, name: string, fn: Function}} Test descriptor.
 */
function createSliceToArrayTest(vector, name) {
    // Sink for the measured result; presumably kept so the call is not optimized away.
    let xs;
    return {
        async: true,
        name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`,
        fn() { xs = vector.slice().toArray(); }
    };
}
114+
115+
function createIterateTest(vector, name) {
92116
let value;
93117
return {
94118
async: true,
95-
name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}`,
119+
name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`,
96120
fn() { for (value of vector) {} }
97121
};
98122
}
99123

100-
function createGetByIndexTest(vector) {
124+
function createGetByIndexTest(vector, name) {
101125
let value;
102126
return {
103127
async: true,
104-
name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}`,
128+
name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`,
105129
fn() {
106130
for (let i = -1, n = vector.length; ++i < n;) {
107131
value = vector.get(i);
108132
}
109133
}
110134
};
111135
}
136+
137+
/**
 * Build a benchmark test that counts matching rows by looping over every
 * record batch of `table` and calling `vector.get(index)` for each row,
 * without going through `table.filter`.
 *
 * @param {Object} table - Object exposing `schema.fields`, `columns`,
 *   `batches` and `numRows`.
 * @param {string} column - Name of the column to scan.
 * @param {string} test - Comparison to apply: 'gteq' (>=) or 'eq' (===).
 * @param {*} value - Right-hand side of the comparison.
 * @returns {{async: boolean, name: string, fn: Function}} Test descriptor.
 * @throws {Error} If `test` is not recognized or `column` is not in the schema.
 */
function createDataFrameDirectCountTest(table, column, test, value) {
    const colidx = table.schema.fields.findIndex((c) => c.name === column);

    // Running count, reset at the start of every measured run.
    let sum;
    // Declared locally: the original assigned an undeclared `op`, which leaks a
    // global in sloppy mode and throws a ReferenceError in strict mode.
    let op;

    // The two loops are kept separate so each comparison stays inline in the
    // measured loop rather than going through a callback.
    if (test === 'gteq') {
        op = function () {
            sum = 0;
            const batches = table.batches;
            const numBatches = batches.length;
            for (let batchIndex = 0; batchIndex < numBatches; ++batchIndex) {
                const { numRows, columns } = batches[batchIndex];
                const vector = columns[colidx];
                for (let index = 0; index < numRows; ++index) {
                    // A boolean adds as 0 or 1.
                    sum += (vector.get(index) >= value);
                }
            }
        };
    } else if (test === 'eq') {
        op = function () {
            sum = 0;
            const batches = table.batches;
            const numBatches = batches.length;
            for (let batchIndex = 0; batchIndex < numBatches; ++batchIndex) {
                const { numRows, columns } = batches[batchIndex];
                const vector = columns[colidx];
                for (let index = 0; index < numRows; ++index) {
                    sum += (vector.get(index) === value);
                }
            }
        };
    } else {
        throw new Error(`Unrecognized test "${test}"`);
    }

    // Fail with a clear message instead of a TypeError from `columns[-1].type`.
    if (colidx === -1) {
        throw new Error(`Unknown column "${column}"`);
    }

    return {
        async: true,
        name: `name: '${column}', length: ${table.numRows}, type: ${table.columns[colidx].type}, test: ${test}, value: ${value}\n`,
        fn: op
    };
}
180+
181+
/**
 * Build a benchmark test that calls `table.countBy(column)`.
 *
 * @param {Object} table - Object exposing `schema.fields`, `columns`,
 *   `numRows` and `countBy()`.
 * @param {string} column - Name of the column to count by.
 * @returns {{async: boolean, name: string, fn: Function}} Test descriptor.
 * @throws {Error} If `column` is not in the table schema.
 */
function createDataFrameCountByTest(table, column) {
    const colidx = table.schema.fields.findIndex((c) => c.name === column);
    // Fail with a clear message instead of a TypeError from `columns[-1].type`.
    if (colidx === -1) {
        throw new Error(`Unknown column "${column}"`);
    }

    return {
        async: true,
        name: `name: '${column}', length: ${table.numRows}, type: ${table.columns[colidx].type}\n`,
        fn() {
            table.countBy(column);
        }
    };
}
192+
193+
/**
 * Build a benchmark test that counts the rows matching a predicate through
 * `table.filter(...)`. The filter is built once, up front; only `count()` on
 * the filtered result is measured.
 *
 * @param {Object} table - Object exposing `schema.fields`, `columns`,
 *   `numRows` and `filter()`.
 * @param {string} column - Name of the column the predicate applies to.
 * @param {string} test - Predicate kind: 'gteq' or 'eq'.
 * @param {*} value - Value the column is compared against.
 * @returns {{async: boolean, name: string, fn: Function}} Test descriptor.
 * @throws {Error} If `test` is not recognized or `column` is not in the schema.
 */
function createDataFrameFilterCountTest(table, column, test, value) {
    if (test !== 'gteq' && test !== 'eq') {
        throw new Error(`Unrecognized test "${test}"`);
    }

    const colidx = table.schema.fields.findIndex((c) => c.name === column);
    // Fail with a clear message instead of a TypeError from `columns[-1].type`.
    if (colidx === -1) {
        throw new Error(`Unknown column "${column}"`);
    }

    const predicate = test === 'gteq'
        ? col(column).gteq(value)
        : col(column).eq(value);
    const df = table.filter(predicate);

    return {
        async: true,
        name: `name: '${column}', length: ${table.numRows}, type: ${table.columns[colidx].type}, test: ${test}, value: ${value}\n`,
        fn() {
            df.count();
        }
    };
}

js/perf/table_config.js

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
const fs = require('fs');
19+
const path = require('path');
20+
const glob = require('glob');
21+
22+
// Benchmark inputs exported by this module: one entry per `*.arrow` file in
// test/data/tables that has predicates configured in `counts` below.
const config = [];
const filenames = glob.sync(path.resolve(__dirname, `../test/data/tables/`, `*.arrow`));

// Columns to count by, keyed by table name (the file's base name).
// Declared with `const`: the original assignment created an implicit global.
const countBys = {
    "tracks": ['origin', 'destination']
};

// Filter predicates to count, keyed by table name.
// `test` is 'gteq' or 'eq'; `value` is what the column is compared against.
const counts = {
    "tracks": [
        {col: 'lat',    test: 'gteq', value: 0        },
        {col: 'lng',    test: 'gteq', value: 0        },
        {col: 'origin', test: 'eq',   value: 'Seattle'},
    ]
};

// Own-property check: `name in counts` would also match inherited keys such
// as "constructor" or "toString" if a data file happened to carry that name.
const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

for (const filename of filenames) {
    const { name } = path.parse(filename);
    // Skip data files that have no predicates configured.
    if (hasOwn(counts, name)) {
        config.push({
            name,
            buffers: [fs.readFileSync(filename)],
            // Default to no group-by columns so consumers can always iterate.
            countBys: hasOwn(countBys, name) ? countBys[name] : [],
            counts: counts[name],
        });
    }
}

module.exports = config;

0 commit comments

Comments
 (0)