Skip to content

Commit

Permalink
Merge pull request #127 from nomic-ai/tixrixselection
Browse files Browse the repository at this point in the history
Don't require ix column for selections
  • Loading branch information
bmschmidt committed Jul 19, 2024
2 parents 4ccc8d9 + 074ac1a commit 859ce3d
Show file tree
Hide file tree
Showing 3 changed files with 240 additions and 78 deletions.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"scripts": {
"dev": "vite --mode dev --port 3344 --host",
"format": "prettier --write src",
"prepare": "npm run build",
"build": "vite build && tsc",
"prepublishOnly": "vite build && tsc && typedoc --skipErrorChecking src/*",
"test": "vite build && npm run test:node",
Expand Down
116 changes: 38 additions & 78 deletions src/selection.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,16 @@
import { Deeptable } from './Deeptable';
import { Scatterplot } from './scatterplot';
import { Tile } from './tile';
import { getTileFromRow } from './tixrixqid';
import type * as DS from './shared.d';
import {
Bool,
DataType,
StructRowProxy,
Type,
Utf8,
Vector,
makeData,
} from 'apache-arrow';
import { bisectLeft, range } from 'd3-array';
import { range } from 'd3-array';
interface SelectParams {
name: string;
useNameCache?: boolean; // If true and a selection with that name already exists, use it and ignore all passed parameters. Otherwise, throw an error.
Expand Down Expand Up @@ -501,14 +500,14 @@ export class DataSelection {
return this;
}

async removePoints(name: string, ixes: bigint[]): Promise<DataSelection> {
return this.add_or_remove_points(name, ixes, 'remove');
async removePoints(name: string, points: StructRowProxy[]): Promise<DataSelection> {
return this.add_or_remove_points(name, points, 'remove');
}

// Non-editable behavior:
// if a single point is added, will also adjust the cursor.
async addPoints(name: string, ixes: bigint[]): Promise<DataSelection> {
return this.add_or_remove_points(name, ixes, 'add');
async addPoints(name: string, points: StructRowProxy[]): Promise<DataSelection> {
return this.add_or_remove_points(name, points, 'add');
}

/**
Expand Down Expand Up @@ -539,38 +538,27 @@ export class DataSelection {
// }

public moveCursorToPoint(
point: StructRowProxy<{ ix: DataType<Type.Int64> }>,
point: StructRowProxy,
) {
// The point contains a field called 'ix', which increases in each tile;
// we use this for moving because it lets us do binary search for relevant tile.
const rowNumber = point[Symbol.for('rowIndex')] as number;
const ix = point.ix as bigint;
if (point.ix === undefined) {
throw new Error(
'Unable to move cursor to point, because it has no `ix` property.',
);
}
const relevantTile = getTileFromRow(point, this.deeptable);

let currentOffset = 0;
let relevantTile: Tile = undefined;
let current_tile_ix = 0;
let positionInTile: number;

let current_tile_ix = 0;
for (const match_length of this.match_count) {
const tile = this.tiles[current_tile_ix];

const ixcol = tile.record_batch.getChild('ix').data[0];
if (ixcol[rowNumber] === ix) {
relevantTile = tile;
if (tile.key === relevantTile.key) {
positionInTile = rowNumber;
break;
}
current_tile_ix += 1;
currentOffset += match_length;
}

if (relevantTile === undefined || positionInTile === undefined) {
return null;
}

const column = relevantTile.record_batch.getChild(
this.name,
) as Vector<Bool>;
Expand All @@ -586,76 +574,48 @@ export class DataSelection {

private async add_or_remove_points(
newName: string,
ixes: bigint[],
points: StructRowProxy[],
which: 'add' | 'remove',
) {
let newCursor = 0;
let tileOfMatch = undefined;
) : Promise<DataSelection>{

const matches : Record<string, number[]>= {};
for (const point of points) {
const t = getTileFromRow(point, this.deeptable);
const rowNum = point[Symbol.for('rowIndex')] as number;
if (!matches[t.key]) {
matches[t.key] = [rowNum];
} else {
matches[t.key].push(rowNum);
}
}

const tileFunction = async (tile: Tile) => {
newCursor = -1;
await this.ready;

// First, get the current version of the tile.
const original = (await tile.get_column(this.name)) as Vector<Bool>;
// Then locate the ix column and look for matches.
const ixcol = tile.record_batch.getChild('ix').data[0]
.values as BigInt64Array;
const mask = Bitmask.from_arrow(original);
for (const ix of ixes) {
// Since ix is ordered, we can do a fast binary search to see if the
// point is there--no need for a full scan.

//@ts-expect-error d3.bisect is not aware it works with bigints as well as numbers
const mid = bisectLeft([...ixcol], ix as unknown as number);
const val = tile.record_batch.get(mid);
// We have to check that there's actually a match,
// because the binary search identifies where it *would* be.
if (val !== null && val.ix === ix) {
// Copy the buffer so we don't overwrite the old one.
// Set the specific value.

// Then if there are matches.
if (matches[tile.key] !== undefined) {
const mask = Bitmask.from_arrow(original);
for (const rowNum of matches[tile.key]) {
if (which === 'add') {
mask.set(mid);
if (ixes.length === 1) {
tileOfMatch = tile.key;
// For single additions, we also move the cursor to the
// newly added point.
// First we see the number of points earlier on the current tile.
let offset_in_tile = 0;
for (let i = 0; i < mid; i++) {
if (mask.get(i)) {
offset_in_tile += 1;
}
}
// Then, we count the number of matches already seen
newCursor = offset_in_tile;
}
mask.set(rowNum);
} else {
// If deleting, we set it to zero.
mask.unset(mid);
}
mask.unset(rowNum);
}
}
return mask.to_arrow();
} else {
return original;
}
return mask.to_arrow();
};

const selection = new DataSelection(this.deeptable, {
name: newName,
tileFunction,
});

selection.on('tile loaded', () => {
// The new cursor gets moved when we encounter a singleton
if (newCursor >= 0) {
selection.cursor = newCursor;
for (let i = 0; i < selection.tiles.length; i++) {
const tile = selection.tiles[i];
if (tile.key === tileOfMatch) {
// Don't add the full number of matches here.
break;
}
selection.cursor += this.match_count[i];
}
}
});
await selection.ready;
for (const tile of this.tiles) {
// This one we actually apply. We'll see if that gets to be slow.
Expand Down
201 changes: 201 additions & 0 deletions src/tixrixqid.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
import type { Bool, Data, Field, Struct, StructRowProxy, Vector } from 'apache-arrow';

import type { Tile } from './deepscatter';
import { Bitmask, DataSelection, Deeptable } from './deepscatter';

// Identifier types used to address rows inside the tiles of a quadtree.
// Tiles are addressed by a single integer (Tix) and rows by their position in a tile (Rix).

// A Tix is a tile index, which is an integer identifier for a tile in a quadtree.
// It uses the formula (4^z - 1) / 3 + y * 2^z + x, where z is the zoom level,
// and x and y are the tile coordinates.
type Tix = number;

// An Rix is a row index, which is an integer identifier for a row in a tile.
type Rix = number;

// A Rixen is a list of row indices. It must be non-empty: the tuple type below
// requires at least one element, so a Qid with zero selected rows is not valid.
type Rixen = [Rix, ...Rix[]];

// A Qid is a pair of a Tix and a Rixen. It identifies a set of rows in a tile.
export type Qid = [Tix, Rixen];
// A QidArray is a list of Qids.
export type QidArray = Qid[];

/**
 * Convert quadtree tile coordinates into a single numeric tile index.
 *
 * @param z The zoom level of the tile.
 * @param x The x coordinate of the tile within its zoom level.
 * @param y The y coordinate of the tile within its zoom level.
 * @returns (4^z - 1) / 3 + y * 2^z + x
 */
export function zxyToTix(z: number, x: number, y: number) {
  // Sum of 4^0 + 4^1 + ... + 4^(z-1): the index at which level z begins.
  const levelStart = (4 ** z - 1) / 3;
  // Each row of level z is 2^z tiles wide.
  const rowStart = y * 2 ** z;
  return levelStart + rowStart + x;
}

/**
 * Compute the tile index of the parent of a tile.
 *
 * The parent sits one zoom level up, with each coordinate halved and
 * rounded down.
 *
 * @param tix The numeric tile index of the child.
 * @returns The numeric tile index of the parent.
 */
function parentTix(tix: number) {
  const [level, col, row] = tixToZxy(tix);
  const parentCol = Math.floor(col / 2);
  const parentRow = Math.floor(row / 2);
  return zxyToTix(level - 1, parentCol, parentRow);
}

/**
 * Resolve a numeric tile index to the corresponding tile of a dataset.
 *
 * Index 0 resolves directly to the dataset's root tile. For any other index
 * the parent tile is resolved first (recursively) and its manifest is
 * populated before the dataset is searched for the requested key.
 *
 * @param tix The numeric tile index
 * @param dataset The deepscatter dataset
 * @returns The tile, if it exists.
 * @throws Error if `tix` is NaN, or if no tile with the matching
 * `z/x/y` key is found in the dataset.
 */
export async function tixToTile(tix: Tix, dataset: Deeptable): Promise<Tile> {
  if (isNaN(tix)) {
    throw new Error('NaN tile index');
  }
  if (tix === 0) {
    return dataset.root_tile;
  }

  // We need all parents to exist to find their children, so the parent is
  // fetched (and its manifest populated) before looking for the child.
  const parentTile = await tixToTile(parentTix(tix), dataset);
  await parentTile.populateManifest();

  // With the ancestors loaded, search the dataset for the tile's key.
  const [z, x, y] = tixToZxy(tix);
  const wantedKey = `${z}/${x}/${y}`;
  const candidates = dataset
    .map((tile: Tile) => tile)
    .filter((tile: Tile) => tile.key === wantedKey);
  if (candidates.length === 0) {
    throw new Error(`Tile ${wantedKey} not found in dataset.`);
  }
  return candidates[0];
}

/**
 * Fetch the row that a quadtree id points at.
 *
 * Only the first row index of the Qid is used; any further row indices
 * in it are ignored.
 *
 * @param qid a quadtree id
 * @param dataset The deepscatter dataset containing the tile.
 * @returns The result of `record_batch.get` for the first row index of the Qid.
 */
export async function qidToRowProxy(qid: Qid, dataset: Deeptable) {
  const owner = await tixToTile(qid[0], dataset);
  // The 'x' column is awaited before the record batch is read.
  // NOTE(review): presumably this ensures the tile's data has loaded — confirm.
  await owner.get_column('x');
  const [firstRix] = qid[1];
  return owner.record_batch.get(firstRix);
}

/**
 * Convert a tile key of the form `z/x/y` into its numeric tile index.
 *
 * @param key The tile key, with zoom level and coordinates separated by '/'.
 * @returns The numeric tile index for that key.
 */
export function tileKey_to_tix(key: string) {
  const coords: number[] = [];
  for (const piece of key.split('/')) {
    coords.push(parseInt(piece));
  }
  return zxyToTix(coords[0], coords[1], coords[2]);
}

/**
 * Inverse of the tile-index formula: convert a numeric tile index 'tix'
 * back into its [z, x, y] tuple.
 *
 * The zoom level is found by walking the level boundaries with exact integer
 * arithmetic rather than taking a base-4 logarithm, so the result does not
 * depend on floating-point rounding when `tix` is the first tile of a level.
 *
 * @param tix The numeric tile index.
 * @returns The tuple [z, x, y]. For a negative, NaN, or non-finite index,
 * every entry is NaN.
 */
export function tixToZxy(tix: Tix): [number, number, number] {
  // Invalid indices have no position in the quadtree. Returning NaN lets
  // callers detect the problem with an isNaN check on anything derived from it.
  if (!Number.isFinite(tix) || tix < 0) {
    return [NaN, NaN, NaN];
  }

  // Level z starts at index (4^z - 1) / 3 and holds 4^z tiles. Advance one
  // level at a time until `tix` falls inside the current level.
  let z = 0;
  let levelStart = 0;
  let levelSize = 1;
  while (levelStart + levelSize <= tix) {
    levelStart += levelSize;
    levelSize *= 4;
    z += 1;
  }

  // We then get the index inside the level, which is the offset from its start.
  const blockPosition = tix - levelStart;

  // Modulo operations turn this into x and y coordinates.
  const side = 2 ** z;
  const x = blockPosition % side;
  const y = Math.floor(blockPosition / side);
  return [z, x, y];
}

/**
 * Locate a row as a [tile index, row index] pair.
 *
 * Note that the result holds a single row index, not the non-empty list
 * of row indices that the `Qid` type carries.
 *
 * @param row the row returned from a point event, etc.
 * @param dataset a deepscatter dataset.
 * @returns A two-element tuple: the numeric index of the tile that owns the
 * row, followed by the row's index (read from its 'rowIndex' symbol).
 */
export function getQidFromRow(
  row: StructRowProxy,
  dataset: Deeptable
): [number, number] {
  const owner = getTileFromRow(row, dataset);
  const rowIndex = row[Symbol.for('rowIndex')] as number;
  const located: [number, number] = [tileKey_to_tix(owner.key), rowIndex];
  return located;
}

/**
 * Find the tile that a row came from.
 *
 * The row's parent struct data is compared against every tile in the dataset
 * whose record batch already exists. Since columns are immutable, the value
 * buffers can be compared by identity; but because columns can be added to a
 * tile, the tile sharing the MOST columns with the row wins rather than one
 * matching every column exactly.
 *
 * @param row the row returned from a point event, etc.
 * @param dataset a deepscatter dataset.
 * @returns The tile sharing the largest number of column buffers with the row.
 * @throws Error if no tile shares any column buffer with the row.
 */
export function getTileFromRow(row: StructRowProxy, dataset: Deeptable): Tile {
  const parent = row[Symbol.for('parent')] as Data<Struct>;
  const parentsColumns = parent.children;

  // Pair each field name of the row's parent with that field's data.
  const parentNames: [string, Data][] = parent.type.children.map(
    (d: Field, i: number) => [d.name, parentsColumns[i]],
  );

  // Best candidate so far, and how many of its columns matched.
  const best: { tile: Tile | null; count: number } = { tile: null, count: 0 };

  dataset.map((t: Tile) => {
    // @ts-expect-error NOM-1667 expose existence of record batch without generating it.
    const batch_exists = t._batch !== undefined;
    if (!batch_exists) {
      return;
    }
    let matching_columns = 0;
    for (const [name, column] of parentNames) {
      const b = t.record_batch.getChild(name);
      // Identity comparison of the value buffers: same buffer, same column.
      if (b !== null && b.data[0].values === column.values) {
        matching_columns++;
      }
    }
    if (matching_columns > best.count) {
      best.tile = t;
      best.count = matching_columns;
    }
  });

  // The candidate starts out as null, so null (not undefined) marks "no match".
  if (best.tile === null) {
    // JSON.stringify throws on BigInt values, which would mask this error,
    // so BigInt fields are rendered as strings.
    const description = JSON.stringify(
      { ...row },
      (_key: string, value: unknown) =>
        typeof value === 'bigint' ? value.toString() : value,
    );
    throw new Error('No tiles found for this row.' + description);
  }
  return best.tile;
}

/**
 * Group a list of rows by owning tile.
 *
 * @param rows The rows to locate.
 * @param dataset a deepscatter dataset.
 * @returns One entry per tile, in order of first appearance, pairing the
 * tile index with the row indices of the given rows found in that tile.
 */
export function getQidArrayFromRows(
  rows: StructRowProxy[],
  dataset: Deeptable,
): QidArray {
  // TODO: this is really inefficient. We should be able to do this in one pass.
  const byTile = new Map<number, [number, ...number[]]>();
  for (const row of rows) {
    const [tix, rix] = getQidFromRow(row, dataset);
    const seen = byTile.get(tix);
    if (seen === undefined) {
      // First row seen for this tile: start its list.
      byTile.set(tix, [rix]);
    } else {
      seen.push(rix);
    }
  }
  return Array.from(byTile);
}

/**
 * Build a boolean column for a tile marking the rows named by a Qid list.
 *
 * @param tile The tile to build the column for.
 * @param qidList Qids to mark; entries belonging to other tiles are ignored.
 * @returns The mask converted with `to_arrow`, sized to the tile's row count,
 * with each listed row index of this tile set.
 */
export function selectQixOnTile(tile: Tile, qidList: QidArray) {
  const mask = new Bitmask(tile.record_batch.numRows);
  const ownTix = tileKey_to_tix(tile.key);
  for (const [candidateTix, rixen] of qidList) {
    if (candidateTix !== ownTix) {
      continue;
    }
    for (const rix of rixen) {
      mask.set(rix);
    }
  }
  return mask.to_arrow();
}

/**
 * Check whether a row is currently part of a selection.
 *
 * @param hoverDatum A struct row.
 * @param selection A DataSelection; a missing selection yields false.
 * @param deeptable A Deepscatter dataset
 * @returns true when the selection's column holds a truthy value at the
 * row's position in its owning tile, false otherwise.
 */
export async function isDatumInSelection(
  hoverDatum: StructRowProxy,
  selection: DataSelection | null,
  deeptable: Deeptable,
): Promise<boolean> {
  if (!selection) {
    return false;
  }
  const located = getQidFromRow(hoverDatum, deeptable);
  const tile = await tixToTile(located[0], deeptable);
  // The selection's membership is read from the tile column named after it.
  const membership = (await tile.get_column(selection.name)) as Vector<Bool>;
  return Boolean(membership.get(located[1]));
}

0 comments on commit 859ce3d

Please sign in to comment.