buttoned up hash and ast

This commit is contained in:
Chris Kruining 2025-03-13 10:41:22 +01:00
parent b1e617e74a
commit e88d727d8e
No known key found for this signature in database
GPG key ID: EB894A3560CCCAD2
5 changed files with 261 additions and 255 deletions

View file

@ -0,0 +1,38 @@
import { describe, expect } from "vitest";
import { it } from "~/test-helpers";
import { createElement, splitBy, mergeNodes } from './ast';
/**
 * Unit tests for the hast helpers exported from `./ast`.
 *
 * `createElement` is covered with real assertions. The `splitBy` and
 * `mergeNodes` suites are still placeholders that assert nothing about the
 * functions under test.
 */
describe('ast', () => {
    describe('createElement', () => {
        it('should create an element node from tag name, children and properties', () => {
            // Arrange
            const children = [{ type: 'text', value: 'hello' }];
            const properties = { id: 'greeting' };

            // Act
            const actual = createElement('p', children, properties);

            // Assert
            expect(actual).toEqual({ type: 'element', tagName: 'p', children, properties });
        });

        it('should default properties to an empty object', () => {
            // Arrange
            // Act
            const actual = createElement('div', []);

            // Assert
            expect(actual).toEqual({ type: 'element', tagName: 'div', children: [], properties: {} });
        });
    });

    describe('splitBy', () => {
        // TODO: placeholder, replace with real assertions against `splitBy`.
        it('should ____', () => {
            // Arrange
            // Act
            // Assert
            expect(true).toEqual(true);
        });
    });

    describe('mergeNodes', () => {
        // TODO: placeholder, replace with real assertions against `mergeNodes`.
        it('should ____', () => {
            // Arrange
            // Act
            // Assert
            expect(true).toEqual(true);
        });
    });
});

View file

@ -1,7 +1,7 @@
import type { Node, Text, Parent, RootContent } from 'hast'; import type { Node, Text, Parent, RootContent } from 'hast';
import { find } from 'unist-util-find'; import { find } from 'unist-util-find';
import { visit } from 'unist-util-visit'; import { visit } from 'unist-util-visit';
import { hash } from './temp'; import { hash } from './hash';
export const createElement = (tagName: string, children: any[], properties: object = {}) => ({ type: 'element', tagName, children, properties }); export const createElement = (tagName: string, children: any[], properties: object = {}) => ({ type: 'element', tagName, children, properties });
@ -14,7 +14,6 @@ export const splitBy = (tree: Parent, splitPoints: SplitPoint[]): RootContent[][
const result: RootContent[][] = []; const result: RootContent[][] = [];
let remaining: RootContent[] = Object.hasOwn(tree, 'children') ? (tree as Parent).children : []; let remaining: RootContent[] = Object.hasOwn(tree, 'children') ? (tree as Parent).children : [];
console.log('kaas');
// console.log(Object.groupBy(splitPoints, p => hash(p.node))); // console.log(Object.groupBy(splitPoints, p => hash(p.node)));
for (const { node, offset } of splitPoints) { for (const { node, offset } of splitPoints) {

View file

@ -0,0 +1,54 @@
import { describe, expect } from "vitest";
import { it } from "~/test-helpers";
import { hash } from "./hash";
// Shared fixture for the hash tests: one property per kind of value the
// serializer in `./hash` has to deal with.
// The serializer walks objects with `Object.values`, so the order of the
// properties below is part of what gets hashed; reordering them is expected
// to change the digest pinned in the first test.
const DEFAULT_DATA = {
// Nested object; serialized recursively.
prop_object: {
is: 'some prop',
},
prop_boolean: false,
// Larger than 32 bits, so it is emitted as more than one 32-bit word.
prop_bigint: 1_000_000_000_000n,
// null, undefined, functions and symbols contribute no words to the hash input.
prop_null: null,
prop_undefined: undefined,
prop_function: () => { },
prop_symbol: Symbol('symbol'),
// Typed arrays: four bytes for the byte-packing path, one word for the Uint32Array path.
uint8array: new Uint8Array([0xff, 0x00, 0xff, 0x00]),
uint32array: new Uint32Array([0xff00ff00]),
};
/**
 * Unit tests for `hash`, all driven by the shared `DEFAULT_DATA` fixture.
 */
describe('hash', () => {
    it('should hash a value with sha-1 algorithm', () => {
        // NOTE(review): the pinned digest is 39 hex characters long, while a
        // SHA-1 digest is 40; confirm this value against the implementation.
        const digest = hash(DEFAULT_DATA);

        expect(digest).toEqual('6fe383b712ec74177f7714a3f5db5416accef8b');
    });

    it('should be stable over multiple runs', () => {
        // Hash the very same input twice; both digests must be equal.
        const first = hash(DEFAULT_DATA);
        const second = hash(DEFAULT_DATA);

        expect(first).toEqual(second);
    });

    // No data set large enough to trigger the size guard could be constructed,
    // so for now that branch is treated as unreachable and this test is a
    // placeholder.
    it('should error if the input is too large', () => {
        expect(true).toEqual(true);
    });
});

168
src/features/editor/hash.ts Normal file
View file

@ -0,0 +1,168 @@
import { installIntoGlobal } from "iterator-helpers-polyfill";
installIntoGlobal();
/** Number of 32-bit words in one SHA-1 message block (512 bits). */
const CHUNK_SIZE = 16;
/** Width of a single message word in bits. */
const WORD_BITS = 32;
/** Number of rounds applied to every message block. */
const ROUND_COUNT = 80;
/** Rounds per stage; each stage has its own logical function and additive constant. */
const ROUNDS_PER_STAGE = 20;

/** Initial chaining state: five 32-bit words (20 bytes). */
const initializationVector = Object.freeze([0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0] as const);
/** Additive round constants, one per stage (16 bytes). */
const hashKey = Object.freeze([0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6] as const);

type Word = number & {}; // intersection with the empty object type so tooling displays 'Word' instead of 'number'

/**
 * Computes the SHA-1 digest of `data`.
 *
 * A `Uint32Array` is hashed as-is; any other value is first serialized to
 * 32-bit words with `toBinary`. The words are treated as the big-endian
 * message words of the SHA-1 specification, i.e. the digest equals the SHA-1
 * of the bytes obtained by writing every word in big-endian order.
 *
 * Note: earlier revisions of this function skipped padding, copied only five
 * words per block and did not zero-pad the output, so digests produced by them
 * differ from the ones returned here.
 *
 * @param data Value to hash.
 * @returns The digest as exactly 40 lowercase hexadecimal characters.
 * @throws Error when the message length in bits cannot be represented exactly.
 */
export const hash = (data: unknown): string => {
    const message = data instanceof Uint32Array ? data : new Uint32Array(toBinary(data));
    const bitLength = message.length * WORD_BITS;

    if (!Number.isSafeInteger(bitLength)) {
        throw new Error('Cannot hash more than 2^53 - 1 bits');
    }

    const padded = pad(message, bitLength);
    const state = new Uint32Array(initializationVector);
    const schedule = new Uint32Array(ROUND_COUNT);

    for (let offset = 0; offset < padded.length; offset += CHUNK_SIZE) {
        // The first 16 schedule words are the block itself, the rest is derived from them.
        schedule.set(padded.subarray(offset, offset + CHUNK_SIZE));
        for (let i = CHUNK_SIZE; i < ROUND_COUNT; i++) {
            schedule[i] = circularShiftLeft(schedule[i - 3] ^ schedule[i - 8] ^ schedule[i - 14] ^ schedule[i - 16], 1);
        }

        let [a, b, c, d, e] = state;

        for (let i = 0; i < ROUND_COUNT; i++) {
            // The operands may be signed 32-bit values; `>>> 0` reduces the sum modulo 2^32.
            const tmp = (
                circularShiftLeft(a, 5) +
                logicalHashFunctions(i, b, c, d) +
                e +
                schedule[i] +
                hashKey[Math.floor(i / ROUNDS_PER_STAGE)]
            ) >>> 0;

            e = d;
            d = c;
            c = circularShiftLeft(b, 30);
            b = a;
            a = tmp;
        }

        // Writes to a Uint32Array wrap modulo 2^32, which is the addition SHA-1 asks for.
        state[0] += a;
        state[1] += b;
        state[2] += c;
        state[3] += d;
        state[4] += e;
    }

    // Zero-pad every word so the digest always has 40 characters and is unambiguous.
    return Array.from(state, word => word.toString(16).padStart(8, '0')).join('');
};

/**
 * Applies SHA-1 padding: a single 1 bit, zero bits up to 64 bits short of a
 * block boundary, then the message length in bits as a 64-bit big-endian number.
 */
const pad = (message: Uint32Array, bitLength: number): Uint32Array => {
    // One word for the 0x80000000 marker plus two words for the length.
    const paddedLength = Math.ceil((message.length + 3) / CHUNK_SIZE) * CHUNK_SIZE;
    const padded = new Uint32Array(paddedLength);

    padded.set(message);
    padded[message.length] = 0x80000000;
    padded[paddedLength - 2] = Math.floor(bitLength / 2 ** WORD_BITS);
    padded[paddedLength - 1] = bitLength % 2 ** WORD_BITS;

    return padded;
};

/** Rotates the 32-bit value `subject` left by `offset` bits; the result is an unsigned 32-bit integer. */
const circularShiftLeft = (subject: number, offset: number): number => {
    return ((subject << offset) | (subject >>> (WORD_BITS - offset))) >>> 0;
};

/**
 * Stage dependent logical function of SHA-1.
 *
 * @param index Round number, 0 up to and including 79.
 * @throws Error when `index` lies outside of the four stages.
 */
const logicalHashFunctions = (index: number, b: Word, c: Word, d: Word): Word => {
    if (index < ROUNDS_PER_STAGE) {
        return (b & c) | (~b & d);
    }

    if (index < (2 * ROUNDS_PER_STAGE)) {
        return b ^ c ^ d;
    }

    if (index < (3 * ROUNDS_PER_STAGE)) {
        return (b & c) | (b & d) | (c & d);
    }

    if (index < (4 * ROUNDS_PER_STAGE)) {
        return b ^ c ^ d;
    }

    throw new Error('Unreachable code');
};
/**
 * Serializes an arbitrary value into a stream of numbers that is used as hash input.
 *
 * - functions, symbols, `undefined` and `null` yield nothing;
 * - strings are UTF-8 encoded and packed through `compact`;
 * - numbers are yielded as-is, booleans as 0 or 1;
 * - bigints are yielded in 32-bit groups, least significant group first
 *   (`0n` and negative values yield nothing);
 * - objects yield the serialization of each of their `Object.values`, in order.
 *
 * @param data Value to serialize.
 * @throws Error when a bigint needs more than 10 groups of 32 bits.
 */
const toBinary = function*<T>(data: T): Generator<number, void, unknown> {
    switch (typeof data) {
        case 'function':
        case 'symbol':
        case 'undefined':
            break;

        case 'string':
            yield* compact(new TextEncoder().encode(data));
            break;

        case 'number':
            yield data;
            break;

        case 'boolean':
            yield Number(data);
            break;

        case 'bigint': {
            // Limit the serialization to 10 groups, which covers 10 * 32 bits.
            const ITERATION_LIMIT = 10;
            let value: bigint = data;

            for (let i = 0; i < ITERATION_LIMIT && value > 0n; i++) {
                yield Number(value & 0xffffffffn);
                value >>= 32n;
            }

            // Anything left over did not fit; fail loudly instead of silently
            // hashing a truncated value. This check has to live after the loop,
            // inside of it the counter can never reach the limit.
            if (value > 0n) {
                throw new Error('Iteration limit in bigint serialization reached');
            }
            break;
        }

        case 'object':
            if (data === null) {
                break;
            }

            if (data instanceof Uint8Array) {
                yield* compact(data);
            }

            if (data instanceof Uint32Array) {
                yield* data;
            }

            // Typed arrays also pass through here, so their elements are
            // yielded a second time, one element per number.
            for (const item of Object.values(data)) {
                yield* toBinary(item);
            }
            break;
    }
};
/**
 * Packs a stream of bytes into 32-bit words, four bytes per word with the
 * first byte in the least significant position.
 *
 * Only the low 8 bits of every source value are used. When the number of bytes
 * is not a multiple of four, the remaining 1 to 3 bytes are emitted as a final,
 * partially filled word. They used to be dropped, which made inputs such as
 * 'a', 'b' and '' indistinguishable. As a consequence, the packed output (and
 * any digest derived from it) changes for inputs whose byte length is not a
 * multiple of four.
 *
 * Words are yielded as produced by the 32-bit bitwise operators, so a word
 * whose top bit is set is a negative number.
 *
 * @param source Bytes to pack.
 */
const compact = function* (source: Iterable<number>): Generator<number, void, unknown> {
    let filled = 0;
    let word = 0;

    for (const value of source) {
        word |= (value & 0xff) << (8 * filled);
        filled++;

        if (filled === 4) {
            yield word;

            word = 0;
            filled = 0;
        }
    }

    // Flush the bytes that did not complete a full word.
    if (filled > 0) {
        yield word;
    }
};

View file

@ -1,253 +0,0 @@
/**
 * Helpers for single bits of a number. All of them go through the bitwise
 * operators and return a new number; `subject` itself is never modified.
 */
const bit = {
    /** Whether the bit at `index` is 1. */
    get(subject: number, index: number) {
        const shifted = subject >> index;

        return Boolean(shifted & 1);
    },

    /** Turns the bit at `index` on, or copies `value` into it when a value is given. */
    set(subject: number, index: number, value?: boolean) {
        if (value === undefined) {
            return subject | (1 << index);
        }

        const flag = value ? 1 : 0;

        return this.clear(subject, index) | (flag << index);
    },

    /** Turns the bit at `index` off. */
    clear(subject: number, index: number) {
        const mask = 1 << index;

        return subject & ~mask;
    },

    /** Inverts the bit at `index`. */
    toggle(subject: number, index: number) {
        const mask = 1 << index;

        return subject ^ mask;
    },
};
/** Array-like view over a packed list of booleans. */
interface BitArray {
    [index: number]: boolean;
    length: number;
}

/**
 * Number of bits kept per slot of the backing arrays.
 * The bitwise operators work on 32-bit integers and take the shift count
 * modulo 32, so a slot can hold 32 bits at most; with a larger size bit 32
 * would alias bit 0.
 */
const ITEM_BIT_SIZE = 32;

/**
 * Creates a proxy that behaves like an array of booleans while storing the
 * values as bits.
 *
 * Reading an index that was never set (or was deleted) gives `undefined`,
 * assigning anything but a boolean to an index throws, and `length` is one
 * more than the highest index that was ever set.
 *
 * @param data Initial values, copied into the array index by index.
 */
const createBitArray = (data: boolean[] = []) => {
    // `store` holds the values, `populated` tracks which indices hold a value at all.
    const store: number[] = [];
    const populated: number[] = [];
    let length = 0;

    /** Converts a property key to an index; only canonical non-negative integers qualify. */
    const parseIndex = (key: string) => {
        if (!/^(?:0|[1-9]\d*)$/.test(key)) {
            return undefined;
        }

        const value = Number(key);

        return Number.isSafeInteger(value) ? value : undefined;
    };

    /** Splits an index into the slot number and the bit position inside of that slot. */
    const convert = (index: number) => [
        Math.floor(index / ITEM_BIT_SIZE),
        index % ITEM_BIT_SIZE,
    ] as const;

    const get = (index: number) => {
        if (index >= length) {
            return undefined;
        }

        const [arrayIndex, bitIndex] = convert(index);

        if (bit.get(populated[arrayIndex] ?? 0, bitIndex) === false) {
            return undefined;
        }

        return bit.get(store[arrayIndex] ?? 0, bitIndex);
    };

    const set = (index: number, value: boolean) => {
        const [arrayIndex, bitIndex] = convert(index);

        store[arrayIndex] = bit.set((store[arrayIndex] ?? 0), bitIndex, value);
        populated[arrayIndex] = bit.set((populated[arrayIndex] ?? 0), bitIndex);
        length = Math.max(length, index + 1);
    };

    const clear = (index: number) => {
        // Nothing is stored beyond the current length, and clearing must not grow the array.
        if (index >= length) {
            return;
        }

        const [arrayIndex, bitIndex] = convert(index);

        // The stored value can stay as it is: reads consult `populated` first
        // and a later `set` overwrites the stale bit.
        populated[arrayIndex] = bit.set((populated[arrayIndex] ?? 0), bitIndex, false);
    };

    // initial population of array
    for (const [i, v] of data.entries()) {
        set(i, v);
    }

    return new Proxy<BitArray>([], {
        get(_target, property) {
            if (property === Symbol.species) {
                return 'BitArray';
            }

            if (typeof property === 'symbol') {
                return undefined;
            }

            if (property === 'length') {
                return length;
            }

            // Compare against undefined explicitly, index 0 is a valid index.
            const index = parseIndex(property);

            return index === undefined ? undefined : get(index);
        },

        set(_target, property, value) {
            if (typeof property === 'symbol') {
                return false;
            }

            const index = parseIndex(property);

            if (index === undefined) {
                return false;
            }

            if (typeof value !== 'boolean') {
                throw new Error(`Only able to set boolean values on indices, received '${typeof value}' instead`);
            }

            set(index, value);

            return true;
        },

        deleteProperty(_target, property) {
            if (typeof property === 'symbol') {
                return false;
            }

            const index = parseIndex(property);

            if (index === undefined) {
                return false;
            }

            clear(index);

            return true;
        },
    });
};
// Constants and helper types of the hash implementation.
// BLOCK_SIZE and the Chunk type are declared here but not referenced by the
// rest of the visible code.
const BLOCK_SIZE = 512;
// Step, in Uint32 elements, with which the input is cut into blocks.
const CHUNK_SIZE = 16;
const UINT32_BYTE_SIZE = 4;
const HASH_NUMBER_OF_UINT32 = 5;
// 5 * 4 = 20. Besides being the hash size in bytes, this value is also used
// as the number of rounds per stage when picking the logical function and key.
const HASH_SIZE = HASH_NUMBER_OF_UINT32 * UINT32_BYTE_SIZE;
// Initial value of the five output words.
const initalizationVector /* 20 bytes */ = [0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0] as const;
// Additive constants, indexed by stage (round index divided by HASH_SIZE).
const hashKey /* 16 bytes */ = [0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6] as const;
type Word = number & {}; // intersection with the empty object type so TypeScript shows this as 'Word' and not as 'number'
type Chunk = Iterable<Word> & { length: typeof HASH_NUMBER_OF_UINT32 };
type HashBytes = Uint32Array & { length: typeof HASH_NUMBER_OF_UINT32 };
// Hashes a string, Uint8Array or Uint32Array and returns the five output
// words as concatenated hexadecimal text.
// Throws an Error when the length of the normalized input is not a safe integer.
const _hash = (data: string | Uint8Array | Uint32Array) => {
// Normalize data to byte array
if (typeof data === 'string') {
data = new TextEncoder().encode(data);
}
// Normalize to Uint32Array
// This creates a view on the same underlying buffer, nothing is copied.
// NOTE(review): the Uint32Array constructor rejects byte offsets that are not
// a multiple of 4, and `byteLength / 4` is fractional for other byte counts.
// Confirm callers only pass suitably sized and aligned data.
if (data instanceof Uint8Array) {
data = new Uint32Array(data.buffer, data.byteOffset, data.byteLength / 4);
}
if (!Number.isSafeInteger(data.length)) {
throw new Error('Cannot hash more than 2^53 - 1 bits');
}
// prepare blocks
const output = new Uint32Array(initalizationVector) as HashBytes;
// The range is inclusive, so an index equal to `data.length` is produced as
// well whenever the length is a multiple of CHUNK_SIZE (including 0); the
// view for that index is empty.
// NOTE(review): `.map` is called on the iterator returned by `range`, which
// presumably relies on iterator helpers being available. Confirm.
const blocks = range(0, data.length, CHUNK_SIZE, true).map(i => {
const view = data.subarray(i, i + 16);
const words = Array<Word>(80);
// NOTE(review): the view spans up to 16 words but only the first five are
// copied, entries 5 to 15 stay empty. Confirm whether that is intended.
words[0] = view[0];
words[1] = view[1];
words[2] = view[2];
words[3] = view[3];
words[4] = view[4];
return words;
});
// apply blocks
for (const words of blocks) {
let [a, b, c, d, e] = output;
for (const index of range(0, 80)) {
if (index >= 16) {
// NOTE(review): circularShiftLeft takes (subject, offset), so this shifts
// the value 1 by the XOR result rather than the XOR result by 1. Confirm.
words[index] = circularShiftLeft(1, words[index - 3] ^ words[index - 8] ^ words[index - 14] ^ words[index - 16]);
}
// Plain number addition; the sum is not reduced to 32 bits here.
const tmp = (
circularShiftLeft(a, HASH_NUMBER_OF_UINT32) +
logicalHashFunctions(index, b, c, d) +
e +
words[index] +
hashKey[Math.floor(index / HASH_SIZE)]
);
e = d;
d = c;
c = circularShiftLeft(b, 30);
b = a;
a = tmp;
}
// Fold the working variables of this block back into the output words.
output[0] = (output[0] + a) | 0;
output[1] = (output[1] + b) | 0;
output[2] = (output[2] + c) | 0;
output[3] = (output[3] + d) | 0;
output[4] = (output[4] + e) | 0;
}
// Each word is printed without leading zeros, so the length of the result varies.
return output.values().map(word => (word >>> 0).toString(16)).join('');
};
/**
 * Rotates the 32-bit value `subject` to the left by `offset` bits.
 * The result comes out of the bitwise operators, so it is a signed 32-bit
 * integer (negative when the top bit is set).
 */
const circularShiftLeft = (subject: number, offset: number): number => {
    const shiftedOut = subject >>> (32 - offset);
    const shiftedUp = subject << offset;

    return (shiftedUp | shiftedOut) & 0xFFFFFFFF;
};
/**
 * Combines `b`, `c` and `d` with the logical function that belongs to the
 * stage of round `index`; a stage spans HASH_SIZE rounds.
 *
 * @throws Error when `index` is not below 4 * HASH_SIZE.
 */
const logicalHashFunctions = (index: number, b: Word, c: Word, d: Word): Word => {
    // Stage 1: take `c` where `b` is set, `d` elsewhere.
    if (index < HASH_SIZE) {
        return (b & c) | (~b & d);
    }

    // Stage 2: parity.
    if (index < (2 * HASH_SIZE)) {
        return b ^ c ^ d;
    }

    // Stage 3: majority.
    if (index < (3 * HASH_SIZE)) {
        return (b & c) | (b & d) | (c & d);
    }

    // Stage 4: parity again.
    if (index < (4 * HASH_SIZE)) {
        return b ^ c ^ d;
    }

    throw new Error('Unreachable code');
};
/**
 * Lazily yields the numbers from `start` towards `end`, `step` apart.
 *
 * @param start First number that is yielded.
 * @param end Upper bound.
 * @param step Distance between two numbers, 1 by default.
 * @param inclusive Whether `end` itself may be yielded, false by default.
 */
const range = function* (start: number, end: number, step: number = 1, inclusive: boolean = false): Iterator<number> {
    let current = start;

    while (inclusive ? (current <= end) : (current < end)) {
        yield current;

        current += (step ?? 1);
    }
};
/**
 * Hashes any value. Strings, Uint8Arrays and Uint32Arrays are passed to
 * `_hash` directly, everything else is hashed through its JSON text.
 */
export const hash = (data: any): string => {
    const isBinary = typeof data === 'object' && (data instanceof Uint8Array || data instanceof Uint32Array);
    const input = (typeof data === 'string' || isBinary) ? data : JSON.stringify(data);

    return _hash(input);
};