buttoned up hash and ast

2025-03-13 10:41:22 +01:00 · 2025-03-13 10:41:22 +01:00 · e88d727d8e
commit e88d727d8e
parent b1e617e74a
5 changed files with 261 additions and 255 deletions
--- a/src/features/editor/ast.spec.ts
+++ b/src/features/editor/ast.spec.ts
@ -0,0 +1,38 @@
+import { describe, expect } from "vitest";
+import { it } from "~/test-helpers";
+import { createElement, splitBy, mergeNodes } from './ast';
+
+describe('ast', () => { // scaffold for the ast module tests; every case below is still a placeholder
+    describe('createElement', () => {
+        it('should ____', () => {
+            // Arrange
+
+            // Act
+
+            // Assert
+            expect(true).toEqual(true); // placeholder assertion: createElement is not exercised yet
+        });
+    });
+
+    describe('splitBy', () => {
+        it('should ____', () => {
+            // Arrange
+
+            // Act
+
+            // Assert
+            expect(true).toEqual(true); // placeholder assertion: splitBy is not exercised yet
+        });
+    });
+
+    describe('mergeNodes', () => {
+        it('should ____', () => {
+            // Arrange
+
+            // Act
+
+            // Assert
+            expect(true).toEqual(true); // placeholder assertion: mergeNodes is not exercised yet
+        });
+    });
+});
--- a/src/features/editor/ast.ts
+++ b/src/features/editor/ast.ts
@ -1,7 +1,7 @@
 import type { Node, Text, Parent, RootContent } from 'hast';
 import { find } from 'unist-util-find';
 import { visit } from 'unist-util-visit';
-import { hash } from './temp';
+import { hash } from './hash';

 export const createElement = (tagName: string, children: any[], properties: object = {}) => ({ type: 'element', tagName, children, properties });

@ -14,7 +14,6 @@ export const splitBy = (tree: Parent, splitPoints: SplitPoint[]): RootContent[][
    const result: RootContent[][] = [];
    let remaining: RootContent[] = Object.hasOwn(tree, 'children') ? (tree as Parent).children : [];

-    console.log('kaas');
    // console.log(Object.groupBy(splitPoints, p => hash(p.node)));

    for (const { node, offset } of splitPoints) {
--- a/src/features/editor/hash.spec.ts
+++ b/src/features/editor/hash.spec.ts
@ -0,0 +1,54 @@
+import { describe, expect } from "vitest";
+import { it } from "~/test-helpers";
+import { hash } from "./hash";
+
+const DEFAULT_DATA = { // shared fixture for the hash tests: a nested object, several primitives and two typed arrays
+    prop_object: {
+        is: 'some prop',
+    },
+
+    prop_boolean: false,
+    prop_bigint: 1_000_000_000_000n,
+    prop_null: null,
+    prop_undefined: undefined,
+    prop_function: () => { },
+    prop_symbol: Symbol('symbol'),
+
+    uint8array: new Uint8Array([0xff, 0x00, 0xff, 0x00]),
+    uint32array: new Uint32Array([0xff00ff00]),
+};
+
+describe('hash', () => {
+    it('should hash a value with sha-1 algorithm', () => {
+        // Arrange
+        const expected = '6fe383b712ec74177f7714a3f5db5416accef8b'; // 39 hex characters; presumably one of the five words renders as 7 digits because hash() does not zero-pad
+
+        // Act
+        const actual = hash(DEFAULT_DATA);
+
+        // Assert
+        expect(actual).toEqual(expected);
+    });
+
+    it('should be stable over multiple runs', () => {
+        // Arrange
+
+        // Act
+        const run1 = hash(DEFAULT_DATA);
+        const run2 = hash(DEFAULT_DATA);
+
+        // Assert
+        expect(run1).toEqual(run2);
+    });
+
+    // I can't seem to actually create a dataset that is large enough in order to test this.
+    // So, for now, I will consider this unreachable code.
+    it('should error if the input is too large', () => {
+        // Arrange
+
+        // Act
+
+        // Assert
+        expect(true).toEqual(true); // placeholder assertion: the size guard in hash() is not exercised
+    });
+});
--- a/src/features/editor/hash.ts
+++ b/src/features/editor/hash.ts
@ -0,0 +1,168 @@
+import { installIntoGlobal } from "iterator-helpers-polyfill";
+
+installIntoGlobal(); // presumably installs the iterator helpers (such as map on iterators) that hash() relies on; confirm against the polyfill docs
+
+const CHUNK_SIZE = 16; // step, in 32-bit words, between block starts in hash()
+const UINT32_BYTE_SIZE = 4; // bytes per 32-bit word
+const HASH_NUMBER_OF_UINT32 = 5; // number of 32-bit words in the hash state
+const HASH_SIZE = HASH_NUMBER_OF_UINT32 * UINT32_BYTE_SIZE; // 20; also reused as the number of rounds per stage in hash() and logicalHashFunctions()
+const initalizationVector /* 20 bytes */ = Object.freeze([0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0] as const); // seed for the five state words; same values as the SHA-1 initial hash value
+const hashKey             /* 16 bytes */ = Object.freeze([0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6] as const); // one additive constant per stage; same values as the SHA-1 round constants
+
+type Word = number & {}; // intersection with the empty object type so TypeScript shows this as 'Word' and not as 'number'
+type HashBytes = Uint32Array & { length: typeof HASH_NUMBER_OF_UINT32 }; // Uint32Array whose length is typed as the literal 5
+
+export const hash = (data: any) => { // digests data into a lowercase hex string built from five 32-bit state words
+    const buffer = typeof data === 'object' && data instanceof Uint32Array ? data : new Uint32Array(toBinary(data)); // Uint32Array input is used as-is; everything else is serialized by toBinary()
+
+    if (!Number.isSafeInteger(buffer.length)) {
+        throw new Error('Cannot hash more than 2^53 - 1 bits');
+    }
+
+    // prepare blocks: one 80-slot word array per CHUNK_SIZE step through the buffer
+    const output = new Uint32Array(initalizationVector) as HashBytes; // running state, seeded from the initialization vector
+    const blocks = range(0, buffer.length, CHUNK_SIZE).map(i => { // range() includes its end value, so i === buffer.length is also visited when the length is a multiple of CHUNK_SIZE
+        const view = buffer.subarray(i, i + 16);
+        const words = Array<Word>(80);
+
+        words[0] = view[0];
+        words[1] = view[1];
+        words[2] = view[2];
+        words[3] = view[3];
+        words[4] = view[4]; // NOTE(review): only words[0..4] are copied, yet the round loop reads words[0..15] before it starts deriving values; confirm this is intended
+
+        return words;
+    });
+
+    // apply blocks: run 80 rounds per block and fold the result into output
+    for (const words of blocks) {
+        let [a, b, c, d, e] = output;
+
+        for (let i = 0; i < 80; i++) {
+            if (i >= 16) {
+                words[i] = circularShiftLeft(1, words[i - 3] ^ words[i - 8] ^ words[i - 14] ^ words[i - 16]); // NOTE(review): circularShiftLeft is declared as (subject, offset), so this passes the constant 1 as subject and the XOR result as offset; confirm argument order
+            }
+
+            const tmp = (
+                circularShiftLeft(a, HASH_NUMBER_OF_UINT32) + // a rotated left by 5 bits (HASH_NUMBER_OF_UINT32 is 5)
+                logicalHashFunctions(i, b, c, d) +
+                e +
+                words[i] +
+                hashKey[Math.floor(i / HASH_SIZE)] // HASH_SIZE is 20, so the key changes every 20 rounds
+            );
+
+            e = d;
+            d = c;
+            c = circularShiftLeft(b, 30);
+            b = a;
+            a = tmp;
+        }
+
+        output[0] = (output[0] + a) | 0; // | 0 truncates each sum to 32 bits before it is stored back
+        output[1] = (output[1] + b) | 0;
+        output[2] = (output[2] + c) | 0;
+        output[3] = (output[3] + d) | 0;
+        output[4] = (output[4] + e) | 0;
+    }
+
+    return output.values().map(word => (word >>> 0).toString(16)).join(''); // each word as unsigned hex without zero padding, concatenated
+};
+
+const circularShiftLeft = (subject: number, offset: number): number => { // rotates the 32-bit value of subject left by offset bits
+    return ((subject << offset) | (subject >>> 32 - offset)) & (0xFFFFFFFF); // minus binds tighter than >>>, so the right shift is by (32 - offset); the result is a signed 32-bit integer
+};
+
+const logicalHashFunctions = (index: number, b: Word, c: Word, d: Word): Word => { // selects the bitwise mixing function for round index; HASH_SIZE (20) is the stage length
+    if (index < HASH_SIZE) {
+        return (b & c) | (~b & d); // rounds 0-19: each bit of b chooses between c and d
+    }
+    else if (index < (2 * HASH_SIZE)) {
+        return b ^ c ^ d; // rounds 20-39: bitwise parity
+    }
+    else if (index < (3 * HASH_SIZE)) {
+        return (b & c) | (b & d) | (c & d); // rounds 40-59: bitwise majority
+    }
+    else if (index < (4 * HASH_SIZE)) {
+        return b ^ c ^ d; // rounds 60-79: bitwise parity again
+    }
+
+    throw new Error('Unreachable code'); // only reached when index is 80 or more (or NaN); hash() calls this with 0..79
+};
+
+const range = function* (start: number, end: number, step: number): Iterator<number> { // yields start, start + step, ... for as long as the value is <= end
+    for (let i = start; i <= end; i += step) { // end itself is yielded when the sequence lands on it
+        yield i;
+    }
+};
+
+const toBinary = function*<T>(data: T): Generator<number, void, unknown> { // serializes a value into a stream of numbers for hash(), dispatching on typeof
+    switch (typeof data) {
+        case 'function':
+        case 'symbol':
+        case 'undefined':
+            break; // these contribute nothing to the stream
+
+        case 'string':
+            yield* compact(new TextEncoder().encode(data)); // UTF-8 bytes, packed four per number by compact()
+            break;
+
+        case 'number':
+            yield data; // emitted unchanged; hash() later stores the stream in a Uint32Array
+            break;
+
+        case 'boolean':
+            yield Number(data); // false becomes 0, true becomes 1
+            break;
+
+        case 'bigint':
+            let value: bigint = data;
+            // limit the iteration to 10 cycles.
+            // This covers 10*32 bits, which in all honesty should be enough, no?
+            const ITERATION_LIMIT = 10;
+
+            for (let i = 0; i < ITERATION_LIMIT && value > 0; i++) { // emits 32-bit groups, least significant first; zero and negative values emit nothing
+                yield Number((value & 0xffffffffn));
+                value >>= 32n;
+
+                if (i === 10) { // NOTE(review): never true here, because the loop condition keeps i below ITERATION_LIMIT (10)
+                    throw new Error('Iteration limit in bigint serialization reached');
+                }
+            }
+            break;
+
+        case 'object':
+            if (data === null) {
+                break;
+            }
+
+            if (data instanceof Uint8Array) {
+                yield* compact(data);
+            } // NOTE(review): no break or return here, so execution continues into the Object.values loop below
+
+            if (data instanceof Uint32Array) {
+                yield* data;
+            } // NOTE(review): same fall-through into the Object.values loop
+
+            for (const item of Object.values(data)) { // values only, in Object.values order; property names are not part of the stream
+                yield* toBinary(item);
+            }
+            break;
+    }
+};
+
+const compact = function* (source: Iterable<number>): Generator<number, void, unknown> { // packs consecutive bytes into 32-bit values, four bytes per yielded number
+    let i = 0; // position of the current byte within its group of four
+    let buffer = 0; // accumulator for the group being built
+
+    for (const value of source) {
+        buffer |= (value & 0xff) << (8 * i); // the first byte of a group lands in the lowest 8 bits
+
+        if (i === 3) {
+            yield buffer;
+            buffer = 0;
+        }
+
+        i = (i + 1) % 4;
+    } // NOTE(review): a trailing group of fewer than four bytes is never yielded
+};
+
--- a/src/features/editor/temp.ts
+++ b/src/features/editor/temp.ts
@ -1,253 +0,0 @@
-const bit = {
-    get(subject: number, index: number) {
-        return Boolean((subject >> index) & 1);
-    },
-
-    set(subject: number, index: number, value?: boolean) {
-        if (value !== undefined) {
-            return this.clear(subject, index) | ((value ? 1 : 0) << index);
-        }
-
-        return subject | (1 << index)
-    },
-
-    clear(subject: number, index: number) {
-        return subject & ~(1 << index);
-    },
-
-    toggle(subject: number, index: number) {
-        return subject ^ (1 << index);
-    },
-};
-
-interface BitArray {
-    [index: number]: boolean;
-    length: number;
-}
-
-const ITEM_BIT_SIZE = 64;
-const createBitArray = (data: boolean[] = []) => {
-    const store: number[] = [];
-    const populated: number[] = [];
-    let length = 0;
-
-    const parseIndex = (key: string) => {
-        const value = Number.parseInt(key);
-
-        if (Number.isNaN(value) || !Number.isFinite(value)) {
-            return undefined;
-        }
-
-        return value;
-    };
-
-    const convert = (index: number) => [
-        Math.floor(index / ITEM_BIT_SIZE),
-        index % ITEM_BIT_SIZE,
-    ] as const;
-
-    const get = (index: number) => {
-        if (index >= length) {
-            return undefined;
-        }
-
-        const [arrayIndex, bitIndex] = convert(index);
-
-        if (bit.get(populated[arrayIndex], bitIndex) === false) {
-            return undefined;
-        }
-
-        return bit.get(store[arrayIndex], bitIndex);
-    }
-
-    const set = (index: number, value: boolean) => {
-        const [arrayIndex, bitIndex] = convert(index);
-
-        store[arrayIndex] = bit.set((store[arrayIndex] ?? 0), bitIndex, value);
-        populated[arrayIndex] = bit.set((populated[arrayIndex] ?? 0), bitIndex);
-        length = Math.max(length, index + 1);
-    };
-
-    const clear = (index: number) => {
-        const [arrayIndex, bitIndex] = convert(index);
-
-        // I think I can skip the store because it is covered by the populated list
-        // store[arrayIndex] = bit.set((store[arrayIndex] ?? 0), bitIndex, false);
-        populated[arrayIndex] = bit.set((populated[arrayIndex] ?? 0), bitIndex, false);
-        length = Math.max(length, index);
-    }
-
-    // initial population of array
-    for (const [i, v] of data.entries()) {
-        set(i, v);
-    }
-
-    return new Proxy<BitArray>([], {
-        get(target, property, receiver) {
-            if (property === Symbol.species) {
-                return 'BitArray'
-            }
-
-            if (typeof property === 'symbol') {
-                return undefined;
-            }
-
-            const index = parseIndex(property);
-
-            if (index) {
-                console.log(store.map(i => i.toString(2)), populated.map(i => i.toString(2)));
-
-                return get(index);
-            }
-
-            console.log(property, index);
-        },
-
-        set(target, property, value, receiver) {
-            if (typeof property === 'symbol') {
-                return false;
-            }
-
-            const index = parseIndex(property);
-
-            if (index) {
-                if (typeof value !== 'boolean') {
-                    throw new Error(`Only able to set boolean values on indices, received '${typeof value}' instead`)
-                }
-
-                set(index, value);
-
-                return true;
-            }
-
-            return false;
-        },
-
-        deleteProperty(target, property) {
-            if (typeof property === 'symbol') {
-                return false;
-            }
-
-            const index = parseIndex(property);
-
-            if (index) {
-                clear(index);
-
-                return true;
-            }
-
-            return false;
-        },
-    });
-};
-
-const BLOCK_SIZE = 512;
-const CHUNK_SIZE = 16;
-const UINT32_BYTE_SIZE = 4;
-const HASH_NUMBER_OF_UINT32 = 5;
-const HASH_SIZE = HASH_NUMBER_OF_UINT32 * UINT32_BYTE_SIZE;
-const initalizationVector /* 20 bytes */ = [0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0] as const;
-const hashKey             /* 16 bytes */ = [0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6] as const;
-
-type Word = number & {}; // union with empty object so typescript show this as 'Word' and not as 'number'
-type Chunk = Iterable<Word> & { length: typeof HASH_NUMBER_OF_UINT32 };
-type HashBytes = Uint32Array & { length: typeof HASH_NUMBER_OF_UINT32 };
-
-const _hash = (data: string | Uint8Array | Uint32Array) => {
-    // Normalize data to byte array
-    if (typeof data === 'string') {
-        data = new TextEncoder().encode(data);
-    }
-
-    // Normalize to Uint32Array
-    if (data instanceof Uint8Array) {
-        data = new Uint32Array(data.buffer, data.byteOffset, data.byteLength / 4);
-    }
-
-    if (!Number.isSafeInteger(data.length)) {
-        throw new Error('Cannot hash more than 2^53 - 1 bits');
-    }
-
-    // prepare blocks
-    const output = new Uint32Array(initalizationVector) as HashBytes;
-    const blocks = range(0, data.length, CHUNK_SIZE, true).map(i => {
-        const view = data.subarray(i, i + 16);
-        const words = Array<Word>(80);
-
-        words[0] = view[0];
-        words[1] = view[1];
-        words[2] = view[2];
-        words[3] = view[3];
-        words[4] = view[4];
-
-        return words;
-    });
-
-    // apply blocks
-    for (const words of blocks) {
-        let [a, b, c, d, e] = output;
-
-        for (const index of range(0, 80)) {
-            if (index >= 16) {
-                words[index] = circularShiftLeft(1, words[index - 3] ^ words[index - 8] ^ words[index - 14] ^ words[index - 16]);
-            }
-
-            const tmp = (
-                circularShiftLeft(a, HASH_NUMBER_OF_UINT32) +
-                logicalHashFunctions(index, b, c, d) +
-                e +
-                words[index] +
-                hashKey[Math.floor(index / HASH_SIZE)]
-            );
-
-            e = d;
-            d = c;
-            c = circularShiftLeft(b, 30);
-            b = a;
-            a = tmp;
-        }
-
-        output[0] = (output[0] + a) | 0;
-        output[1] = (output[1] + b) | 0;
-        output[2] = (output[2] + c) | 0;
-        output[3] = (output[3] + d) | 0;
-        output[4] = (output[4] + e) | 0;
-    }
-
-    return output.values().map(word => (word >>> 0).toString(16)).join('');
-};
-
-const circularShiftLeft = (subject: number, offset: number): number => {
-    return ((subject << offset) | (subject >>> 32 - offset)) & (0xFFFFFFFF);
-};
-
-const logicalHashFunctions = (index: number, b: Word, c: Word, d: Word): Word => {
-    if (index < HASH_SIZE) {
-        return (b & c) | (~b & d);
-    }
-    else if (index < (2 * HASH_SIZE)) {
-        return b ^ c ^ d;
-    }
-    else if (index < (3 * HASH_SIZE)) {
-        return (b & c) | (b & d) | (c & d);
-    }
-    else if (index < (4 * HASH_SIZE)) {
-        return b ^ c ^ d;
-    }
-
-    throw new Error('Unreachable code');
-};
-
-const range = function* (start: number, end: number, step: number = 1, inclusive: boolean = false): Iterator<number> {
-    for (let i = start; inclusive ? (i <= end) : (i < end); i += (step ?? 1)) {
-        yield i;
-    }
-};
-
-export const hash = (data: any): string => {
-    if (typeof data === 'string' || (typeof data === 'object' && (data instanceof Uint8Array || data instanceof Uint32Array))) {
-        return _hash(data);
-    }
-
-    return _hash(JSON.stringify(data));
-};