From e88d727d8e89fa3f6b492b80a7381ab76dd62836 Mon Sep 17 00:00:00 2001 From: Chris Kruining Date: Thu, 13 Mar 2025 10:41:22 +0100 Subject: [PATCH] buttoned up hash and ast --- src/features/editor/ast.spec.ts | 38 +++++ src/features/editor/ast.ts | 3 +- src/features/editor/hash.spec.ts | 54 +++++++ src/features/editor/hash.ts | 168 ++++++++++++++++++++ src/features/editor/temp.ts | 253 ------------------------------- 5 files changed, 261 insertions(+), 255 deletions(-) create mode 100644 src/features/editor/hash.spec.ts create mode 100644 src/features/editor/hash.ts delete mode 100644 src/features/editor/temp.ts diff --git a/src/features/editor/ast.spec.ts b/src/features/editor/ast.spec.ts index e69de29..ef4a9ad 100644 --- a/src/features/editor/ast.spec.ts +++ b/src/features/editor/ast.spec.ts @@ -0,0 +1,38 @@ +import { describe, expect } from "vitest"; +import { it } from "~/test-helpers"; +import { createElement, splitBy, mergeNodes } from './ast'; + +describe('ast', () => { + describe('createElement', () => { + it('should ____', () => { + // Arrange + + // Act + + // Assert + expect(true).toEqual(true); + }); + }); + + describe('splitBy', () => { + it('should ____', () => { + // Arrange + + // Act + + // Assert + expect(true).toEqual(true); + }); + }); + + describe('mergeNodes', () => { + it('should ____', () => { + // Arrange + + // Act + + // Assert + expect(true).toEqual(true); + }); + }); +}); \ No newline at end of file diff --git a/src/features/editor/ast.ts b/src/features/editor/ast.ts index 6ec041f..ea08fb9 100644 --- a/src/features/editor/ast.ts +++ b/src/features/editor/ast.ts @@ -1,7 +1,7 @@ import type { Node, Text, Parent, RootContent } from 'hast'; import { find } from 'unist-util-find'; import { visit } from 'unist-util-visit'; -import { hash } from './temp'; +import { hash } from './hash'; export const createElement = (tagName: string, children: any[], properties: object = {}) => ({ type: 'element', tagName, children, properties }); @@ -14,7 +14,6 
@@ export const splitBy = (tree: Parent, splitPoints: SplitPoint[]): RootContent[][ const result: RootContent[][] = []; let remaining: RootContent[] = Object.hasOwn(tree, 'children') ? (tree as Parent).children : []; - console.log('kaas'); // console.log(Object.groupBy(splitPoints, p => hash(p.node))); for (const { node, offset } of splitPoints) { diff --git a/src/features/editor/hash.spec.ts b/src/features/editor/hash.spec.ts new file mode 100644 index 0000000..a79bb58 --- /dev/null +++ b/src/features/editor/hash.spec.ts @@ -0,0 +1,54 @@ +import { describe, expect } from "vitest"; +import { it } from "~/test-helpers"; +import { hash } from "./hash"; + +const DEFAULT_DATA = { + prop_object: { + is: 'some prop', + }, + + prop_boolean: false, + prop_bigint: 1_000_000_000_000n, + prop_null: null, + prop_undefined: undefined, + prop_function: () => { }, + prop_symbol: Symbol('symbol'), + + uint8array: new Uint8Array([0xff, 0x00, 0xff, 0x00]), + uint32array: new Uint32Array([0xff00ff00]), +}; + +describe('hash', () => { + it('should hash a value with sha-1 algorithm', () => { + // Arrange + const expected = '6fe383b712ec74177f7714a3f5db5416accef8b'; + + // Act + const actual = hash(DEFAULT_DATA); + + // Assert + expect(actual).toEqual(expected); + }); + + it('should be stable over multiple runs', () => { + // Arrange + + // Act + const run1 = hash(DEFAULT_DATA); + const run2 = hash(DEFAULT_DATA); + + // Assert + expect(run1).toEqual(run2); + }); + + // I can't seem to actually create a dataset that is large enough in order to test this. + // So, for now, I will consider this unreachable code. 
/**
 * SHA-1 based content hashing for arbitrary JavaScript values.
 *
 * A value is first flattened into a stream of unsigned 32-bit words (see
 * `toBinary`); that word stream is then digested with SHA-1, treating every
 * word as four big-endian message bytes.
 *
 * NOTE(review): the earlier revision of this module imported
 * `iterator-helpers-polyfill` and called `installIntoGlobal()` so it could use
 * `.map` on iterators. This version uses plain loops and `Array.from`, so the
 * module no longer installs the polyfill — confirm nothing else relied on that
 * global side effect.
 *
 * NOTE(review): digests differ from the ones produced before this fix (the old
 * output was not SHA-1 and was not zero-padded), so any pinned expectation has
 * to be regenerated.
 */

/** Number of 32-bit words in one 512-bit SHA-1 block. */
const CHUNK_SIZE = 16;
const UINT32_BYTE_SIZE = 4;
const HASH_NUMBER_OF_UINT32 = 5;
/** SHA-1 runs 80 rounds per block, split into four stages of 20 rounds. */
const ROUND_COUNT = 80;
const ROUNDS_PER_STAGE = 20;
/** A bigint is serialised into at most this many 32-bit words (10 * 32 = 320 bits). */
const BIGINT_WORD_LIMIT = 10;
const initializationVector /* 20 bytes */ = Object.freeze([0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0] as const);
const hashKey /* 16 bytes */ = Object.freeze([0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6] as const);

type Word = number & {}; // intersection with empty object so TypeScript shows this as 'Word' and not as 'number'
type HashBytes = Uint32Array & { length: typeof HASH_NUMBER_OF_UINT32 };

/**
 * Computes the SHA-1 digest of `data`.
 *
 * A `Uint32Array` is hashed as-is; every other value is serialised with
 * `toBinary` first. Because strings and `Uint8Array`s are packed
 * little-endian into words while SHA-1 reads words big-endian, the digest of
 * a string is NOT the SHA-1 of its raw UTF-8 bytes.
 *
 * @param data - The value to hash.
 * @returns The digest as exactly 40 lowercase hexadecimal characters.
 * @throws Error when the message bit length is not a safe integer, or when a
 * bigint inside `data` needs more than 320 bits.
 */
export const hash = (data: unknown): string => {
    const message = data instanceof Uint32Array ? data : new Uint32Array(toBinary(data));
    const bitLength = message.length * UINT32_BYTE_SIZE * 8;

    if (!Number.isSafeInteger(bitLength)) {
        throw new Error('Cannot hash more than 2^53 - 1 bits');
    }

    // SHA-1 padding: a single 1-bit (0x80000000 as a word), zeros, and the
    // 64-bit message bit length, rounded up to a whole number of blocks.
    // The "+ 3" reserves one word for the marker and two for the length.
    const padded = new Uint32Array(Math.ceil((message.length + 3) / CHUNK_SIZE) * CHUNK_SIZE);
    padded.set(message);
    padded[message.length] = 0x80000000;
    padded[padded.length - 2] = Math.floor(bitLength / 0x100000000);
    padded[padded.length - 1] = bitLength % 0x100000000;

    const output = new Uint32Array(initializationVector) as HashBytes;
    const words = new Uint32Array(ROUND_COUNT);

    for (let offset = 0; offset < padded.length; offset += CHUNK_SIZE) {
        // Message schedule: the first 16 words come straight from the block,
        // the remaining 64 are derived from earlier schedule words.
        for (let i = 0; i < ROUND_COUNT; i++) {
            words[i] = i < CHUNK_SIZE
                ? padded[offset + i]
                : circularShiftLeft(words[i - 3] ^ words[i - 8] ^ words[i - 14] ^ words[i - 16], 1);
        }

        let [a, b, c, d, e] = output;

        for (let i = 0; i < ROUND_COUNT; i++) {
            // The sum of five 32-bit terms stays well inside the exact integer
            // range of a double; `| 0` reduces it modulo 2^32.
            const tmp = (
                circularShiftLeft(a, 5) +
                logicalHashFunctions(i, b, c, d) +
                e +
                words[i] +
                hashKey[Math.floor(i / ROUNDS_PER_STAGE)]
            ) | 0;

            e = d;
            d = c;
            c = circularShiftLeft(b, 30);
            b = a;
            a = tmp;
        }

        // Storing into a Uint32Array wraps the signed results back to unsigned.
        output[0] = (output[0] + a) | 0;
        output[1] = (output[1] + b) | 0;
        output[2] = (output[2] + c) | 0;
        output[3] = (output[3] + d) | 0;
        output[4] = (output[4] + e) | 0;
    }

    // Zero-pad every word so the digest always has the same length.
    return Array.from(output, word => word.toString(16).padStart(UINT32_BYTE_SIZE * 2, '0')).join('');
};

/**
 * Rotates the low 32 bits of `subject` left by `offset` bits (1..31).
 *
 * @returns The rotated value as an unsigned 32-bit integer.
 */
const circularShiftLeft = (subject: number, offset: number): number => {
    return ((subject << offset) | (subject >>> (32 - offset))) >>> 0;
};

/**
 * Selects the SHA-1 round function for round `index` (0..79).
 *
 * @throws Error when `index` is 80 or larger.
 */
const logicalHashFunctions = (index: number, b: Word, c: Word, d: Word): Word => {
    if (index < ROUNDS_PER_STAGE) {
        return (b & c) | (~b & d);
    }
    else if (index < (2 * ROUNDS_PER_STAGE)) {
        return b ^ c ^ d;
    }
    else if (index < (3 * ROUNDS_PER_STAGE)) {
        return (b & c) | (b & d) | (c & d);
    }
    else if (index < (4 * ROUNDS_PER_STAGE)) {
        return b ^ c ^ d;
    }

    throw new Error('Unreachable code');
};

/**
 * Flattens a value into the 32-bit words that `hash` digests.
 *
 * - functions, symbols, `undefined` and `null` contribute nothing;
 * - strings are UTF-8 encoded and packed with `compact`;
 * - integers in the unsigned 32-bit range are one word, every other number is
 *   the two words of its IEEE-754 double representation;
 * - booleans are `0` or `1`;
 * - bigints are an optional sign marker followed by little-endian 32-bit limbs;
 * - `Uint8Array` is packed with `compact`, `Uint32Array` is emitted verbatim;
 * - any other object contributes the serialisation of its `Object.values`.
 *
 * Limitations: property names are not part of the stream and values carry no
 * type tag, so differently shaped inputs can serialise to the same words.
 *
 * @throws Error when a bigint needs more than `BIGINT_WORD_LIMIT` words.
 */
const toBinary = function* (data: unknown): Generator<number, void, undefined> {
    switch (typeof data) {
        case 'function':
        case 'symbol':
        case 'undefined':
            break;

        case 'string':
            yield* compact(new TextEncoder().encode(data));
            break;

        case 'number':
            if (Number.isInteger(data) && data >= 0 && data <= 0xffffffff) {
                yield data;
            }
            else {
                // Anything that does not fit a single word would be mangled by
                // the Uint32Array conversion, so emit the full double instead.
                const view = new DataView(new ArrayBuffer(8));
                view.setFloat64(0, data);

                yield view.getUint32(0);
                yield view.getUint32(4);
            }
            break;

        case 'boolean':
            yield Number(data);
            break;

        case 'bigint': {
            const negative = data < 0n;
            let magnitude = negative ? -data : data;
            const limbs: number[] = [];

            // do/while so that 0n still produces one limb.
            do {
                if (limbs.length === BIGINT_WORD_LIMIT) {
                    throw new Error('Iteration limit in bigint serialization reached');
                }

                limbs.push(Number(magnitude & 0xffffffffn));
                magnitude >>= 32n;
            } while (magnitude > 0n);

            if (negative) {
                // Sign marker so that -n and n do not serialise identically.
                yield 0xffffffff;
            }

            yield* limbs;
            break;
        }

        case 'object':
            if (data === null) {
                break;
            }

            if (data instanceof Uint8Array) {
                yield* compact(data);
                break;
            }

            if (data instanceof Uint32Array) {
                yield* data;
                break;
            }

            for (const item of Object.values(data)) {
                yield* toBinary(item);
            }
            break;
    }
};

/**
 * Packs bytes into unsigned 32-bit words, least significant byte first.
 *
 * A trailing group of 1..3 bytes is emitted as a final zero-padded word, so
 * `[0x61]` and `[0x61, 0x00]` still produce the same word.
 */
const compact = function* (source: Iterable<number>): Generator<number, void, undefined> {
    let filled = 0;
    let word = 0;

    for (const value of source) {
        word |= (value & 0xff) << (8 * filled);
        filled++;

        if (filled === UINT32_BYTE_SIZE) {
            yield word >>> 0;
            word = 0;
            filled = 0;
        }
    }

    if (filled > 0) {
        yield word >>> 0;
    }
};
1 : 0) << index); - } - - return subject | (1 << index) - }, - - clear(subject: number, index: number) { - return subject & ~(1 << index); - }, - - toggle(subject: number, index: number) { - return subject ^ (1 << index); - }, -}; - -interface BitArray { - [index: number]: boolean; - length: number; -} - -const ITEM_BIT_SIZE = 64; -const createBitArray = (data: boolean[] = []) => { - const store: number[] = []; - const populated: number[] = []; - let length = 0; - - const parseIndex = (key: string) => { - const value = Number.parseInt(key); - - if (Number.isNaN(value) || !Number.isFinite(value)) { - return undefined; - } - - return value; - }; - - const convert = (index: number) => [ - Math.floor(index / ITEM_BIT_SIZE), - index % ITEM_BIT_SIZE, - ] as const; - - const get = (index: number) => { - if (index >= length) { - return undefined; - } - - const [arrayIndex, bitIndex] = convert(index); - - if (bit.get(populated[arrayIndex], bitIndex) === false) { - return undefined; - } - - return bit.get(store[arrayIndex], bitIndex); - } - - const set = (index: number, value: boolean) => { - const [arrayIndex, bitIndex] = convert(index); - - store[arrayIndex] = bit.set((store[arrayIndex] ?? 0), bitIndex, value); - populated[arrayIndex] = bit.set((populated[arrayIndex] ?? 0), bitIndex); - length = Math.max(length, index + 1); - }; - - const clear = (index: number) => { - const [arrayIndex, bitIndex] = convert(index); - - // I think I can skip the store because it is covered by the populated list - // store[arrayIndex] = bit.set((store[arrayIndex] ?? 0), bitIndex, false); - populated[arrayIndex] = bit.set((populated[arrayIndex] ?? 
0), bitIndex, false); - length = Math.max(length, index); - } - - // initial population of array - for (const [i, v] of data.entries()) { - set(i, v); - } - - return new Proxy([], { - get(target, property, receiver) { - if (property === Symbol.species) { - return 'BitArray' - } - - if (typeof property === 'symbol') { - return undefined; - } - - const index = parseIndex(property); - - if (index) { - console.log(store.map(i => i.toString(2)), populated.map(i => i.toString(2))); - - return get(index); - } - - console.log(property, index); - }, - - set(target, property, value, receiver) { - if (typeof property === 'symbol') { - return false; - } - - const index = parseIndex(property); - - if (index) { - if (typeof value !== 'boolean') { - throw new Error(`Only able to set boolean values on indices, received '${typeof value}' instead`) - } - - set(index, value); - - return true; - } - - return false; - }, - - deleteProperty(target, property) { - if (typeof property === 'symbol') { - return false; - } - - const index = parseIndex(property); - - if (index) { - clear(index); - - return true; - } - - return false; - }, - }); -}; - -const BLOCK_SIZE = 512; -const CHUNK_SIZE = 16; -const UINT32_BYTE_SIZE = 4; -const HASH_NUMBER_OF_UINT32 = 5; -const HASH_SIZE = HASH_NUMBER_OF_UINT32 * UINT32_BYTE_SIZE; -const initalizationVector /* 20 bytes */ = [0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0] as const; -const hashKey /* 16 bytes */ = [0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6] as const; - -type Word = number & {}; // union with empty object so typescript show this as 'Word' and not as 'number' -type Chunk = Iterable & { length: typeof HASH_NUMBER_OF_UINT32 }; -type HashBytes = Uint32Array & { length: typeof HASH_NUMBER_OF_UINT32 }; - -const _hash = (data: string | Uint8Array | Uint32Array) => { - // Normalize data to byte array - if (typeof data === 'string') { - data = new TextEncoder().encode(data); - } - - // Normalize to Uint32Array - if (data instanceof 
Uint8Array) { - data = new Uint32Array(data.buffer, data.byteOffset, data.byteLength / 4); - } - - if (!Number.isSafeInteger(data.length)) { - throw new Error('Cannot hash more than 2^53 - 1 bits'); - } - - // prepare blocks - const output = new Uint32Array(initalizationVector) as HashBytes; - const blocks = range(0, data.length, CHUNK_SIZE, true).map(i => { - const view = data.subarray(i, i + 16); - const words = Array(80); - - words[0] = view[0]; - words[1] = view[1]; - words[2] = view[2]; - words[3] = view[3]; - words[4] = view[4]; - - return words; - }); - - // apply blocks - for (const words of blocks) { - let [a, b, c, d, e] = output; - - for (const index of range(0, 80)) { - if (index >= 16) { - words[index] = circularShiftLeft(1, words[index - 3] ^ words[index - 8] ^ words[index - 14] ^ words[index - 16]); - } - - const tmp = ( - circularShiftLeft(a, HASH_NUMBER_OF_UINT32) + - logicalHashFunctions(index, b, c, d) + - e + - words[index] + - hashKey[Math.floor(index / HASH_SIZE)] - ); - - e = d; - d = c; - c = circularShiftLeft(b, 30); - b = a; - a = tmp; - } - - output[0] = (output[0] + a) | 0; - output[1] = (output[1] + b) | 0; - output[2] = (output[2] + c) | 0; - output[3] = (output[3] + d) | 0; - output[4] = (output[4] + e) | 0; - } - - return output.values().map(word => (word >>> 0).toString(16)).join(''); -}; - -const circularShiftLeft = (subject: number, offset: number): number => { - return ((subject << offset) | (subject >>> 32 - offset)) & (0xFFFFFFFF); -}; - -const logicalHashFunctions = (index: number, b: Word, c: Word, d: Word): Word => { - if (index < HASH_SIZE) { - return (b & c) | (~b & d); - } - else if (index < (2 * HASH_SIZE)) { - return b ^ c ^ d; - } - else if (index < (3 * HASH_SIZE)) { - return (b & c) | (b & d) | (c & d); - } - else if (index < (4 * HASH_SIZE)) { - return b ^ c ^ d; - } - - throw new Error('Unreachable code'); -}; - -const range = function* (start: number, end: number, step: number = 1, inclusive: boolean = false): 
Iterator { - for (let i = start; inclusive ? (i <= end) : (i < end); i += (step ?? 1)) { - yield i; - } -}; - -export const hash = (data: any): string => { - if (typeof data === 'string' || (typeof data === 'object' && (data instanceof Uint8Array || data instanceof Uint32Array))) { - return _hash(data); - } - - return _hash(JSON.stringify(data)); -}; \ No newline at end of file