MediaWiki:Gadget-LabelScanIndexer.js: Unterschied zwischen den Versionen
Admin (Diskussion | Beiträge) Keine Bearbeitungszusammenfassung |
Admin (Diskussion | Beiträge) Keine Bearbeitungszusammenfassung |
||
| (7 dazwischenliegende Versionen desselben Benutzers werden nicht angezeigt) | |||
| Zeile 1: | Zeile 1: | ||
/* Gadget: LabelScanIndexer (Auto-Save, | /* Gadget: LabelScanIndexer (Auto-Save, lokal, ESM via externem Shim) | ||
* Erzeugt Embeddings lokal (CLIP) und speichert in MediaWiki:Gadget-LabelScan-index.json | * Erzeugt Embeddings lokal (CLIP) und speichert in MediaWiki:Gadget-LabelScan-index.json | ||
* Läuft nur auf "Hilfe:LabelScan-Indexer" | * Läuft nur auf "Hilfe:LabelScan-Indexer" | ||
| Zeile 16: | Zeile 16: | ||
var INDEX_TITLE = 'MediaWiki:Gadget-LabelScan-index.json'; | var INDEX_TITLE = 'MediaWiki:Gadget-LabelScan-index.json'; | ||
// ---------- Pfade | // ---------- Lokale Pfade ---------- | ||
var TRANSFORMERS_SHIM = '/vendor/transformers/esm-shim.js'; // neu: externes Modul | |||
var | var WASM_DIR = '/vendor/transformers/'; // enthält ort-wasm*.wasm | ||
var WASM_DIR = ' | var MODEL_ID = 'Xenova/clip-vit-base-patch32'; | ||
var LOCAL_MODEL_PATH = '/models'; | |||
var | // Files für Sanity-Check | ||
var CHECK_URLS = [ | |||
LOCAL_MODEL_PATH + '/Xenova/clip-vit-base-patch32/preprocessor_config.json', | |||
LOCAL_MODEL_PATH + '/Xenova/clip-vit-base-patch32/onnx/vision_model_quantized.onnx' | |||
]; | |||
// ---------- | // ---------- Helpers ---------- | ||
/**
 * Shorthand for looking up a DOM element by its id.
 *
 * @param {string} id - Value of the element's id attribute.
 * @returns {?Element} Whatever document.getElementById returns for that id.
 */
function $(id) {
  var node = document.getElementById(id);
  return node;
}
/**
 * Shows a status message in the element with id "idx-status".
 * Silently does nothing when that element is not on the page.
 *
 * @param {string} [t] - Text to display; any falsy value clears the text.
 */
function status(t) {
  var target = $('idx-status');
  if (!target) {
    return;
  }
  target.textContent = t ? t : '';
}
/**
 * Forwards all arguments to console.log, prefixed with the gadget tag.
 * Logging is best-effort: any exception raised while logging is discarded
 * so that diagnostics can never break the caller.
 */
function log() {
  try {
    var parts = Array.prototype.slice.call(arguments);
    parts.unshift('[LabelScanIndexer]');
    console.log.apply(console, parts);
  } catch (ignored) {
    // Intentionally empty: logging must never throw.
  }
}
/**
 * Forwards all arguments to console.warn, prefixed with the gadget tag.
 * Best-effort: an exception raised while logging is discarded.
 */
function warn() {
  try {
    var line = ['[LabelScanIndexer]'];
    for (var i = 0; i < arguments.length; i += 1) {
      line.push(arguments[i]);
    }
    console.warn.apply(console, line);
  } catch (ignored) {
    // Intentionally empty: logging must never throw.
  }
}
/**
 * Forwards all arguments to console.error, prefixed with the gadget tag.
 * Best-effort: an exception raised while logging is discarded.
 */
function err() {
  try {
    var payload = Array.prototype.slice.call(arguments, 0);
    var tagged = ['[LabelScanIndexer]'].concat(payload);
    console.error.apply(console, tagged);
  } catch (ignored) {
    // Intentionally empty: logging must never throw.
  }
}
function hasInterfaceRight() { | function hasInterfaceRight() { | ||
| Zeile 46: | Zeile 53: | ||
} | } | ||
function fileToCanvasExif(file) { | function fileToCanvasExif(file) { | ||
return new Promise(function (resolve, reject) { | return new Promise(function (resolve, reject) { | ||
| Zeile 87: | Zeile 93: | ||
} | } | ||
/**
 * Races an operation against a timer.
 *
 * The returned promise settles exactly like `p` when `p` settles first;
 * otherwise it rejects with a "Timeout: <label> nach <ms> ms" error once
 * `ms` milliseconds have elapsed. The timer is cleared as soon as `p`
 * settles. Note that the underlying operation itself is not aborted.
 *
 * @param {*} p - Promise, thenable, or plain value to wait for.
 * @param {number} ms - Timeout in milliseconds.
 * @param {string} [label] - Name used in the timeout message
 *   (defaults to "operation").
 * @returns {Promise<*>} Promise mirroring `p`, or rejecting on timeout.
 */
function timeoutPromise(p, ms, label) {
  return new Promise(function (resolve, reject) {
    var timer = setTimeout(function () {
      reject(new Error('Timeout: ' + (label || 'operation') + ' nach ' + ms + ' ms'));
    }, ms);
    // Normalise through Promise.resolve so that a plain value or foreign
    // thenable does not throw on `.then` and leave the timer armed.
    Promise.resolve(p).then(
      function (value) { clearTimeout(timer); resolve(value); },
      function (reason) { clearTimeout(timer); reject(reason); }
    );
  });
}
/**
 * Checks that a URL is reachable and answers with a successful HTTP status.
 *
 * Despite the name, the request is a GET (bypassing the HTTP cache via
 * `cache: 'no-store'`). Only the status is inspected, so the response body
 * is cancelled right away instead of being left to download in full.
 *
 * @param {string} url - Resource to probe.
 * @returns {Promise<boolean>} Resolves to true when `res.ok` is set.
 * @throws {Error} (as rejection) "HTTP <status> bei <url>" for non-ok
 *   responses; network errors from fetch propagate unchanged.
 */
function headOk(url) {
  return fetch(url, { method: 'GET', cache: 'no-store' }).then(function (res) {
    // The payload is never read; cancel the stream so that a large file
    // is not transferred just to learn that it exists.
    if (res.body && typeof res.body.cancel === 'function') {
      var cancelling = res.body.cancel();
      if (cancelling && typeof cancelling['catch'] === 'function') {
        // Best-effort: a failed cancel must not turn a successful probe
        // into an error or surface as an unhandled rejection.
        cancelling['catch'](function () {});
      }
    }
    if (!res.ok) throw new Error('HTTP ' + res.status + ' bei ' + url);
    return true;
  });
}
/**
 * Sanity check run before the library is loaded: probes every URL in
 * CHECK_URLS via headOk, each guarded by an 8000 ms timeout.
 *
 * @returns {Promise<boolean[]>} Resolves with one `true` per URL once all
 *   probes succeed; rejects with "Fehler beim Laden: <url>\n→ <reason>" as
 *   soon as any probe fails or times out.
 */
function preflightCheck() {
  // Probe a single URL, logging success and wrapping any failure with the URL.
  function probe(target) {
    var pending = timeoutPromise(headOk(target), 8000, 'check ' + target);
    return pending.then(
      function onReachable() {
        log('OK', target);
        return true;
      },
      function onUnreachable(cause) {
        throw new Error('Fehler beim Laden: ' + target + '\n→ ' + cause.message);
      }
    );
  }

  log('Preflight-Check…');
  var probes = CHECK_URLS.map(function (entry) { return probe(entry); });
  return Promise.all(probes);
}
// ---------- ESM laden über externes Modul (kein inline) ---------- | |||
/**
 * Loads an external ES module by appending a <script type="module"> tag
 * to document.head.
 *
 * @param {string} url - Address of the module file.
 * @returns {Promise<undefined>} Resolves (without a value) when the script's
 *   load event fires; rejects with "Module load failed: <url>" on its error
 *   event.
 */
function loadModuleFile(url) {
  function executor(done, fail) {
    var tag = document.createElement('script');

    function handleLoad() {
      done();
    }

    function handleError() {
      fail(new Error('Module load failed: ' + url));
    }

    tag.type = 'module';
    tag.src = url;
    tag.onload = handleLoad;
    tag.onerror = handleError;
    document.head.appendChild(tag);
  }

  return new Promise(executor);
}
var _libPromise = null; | var _libPromise = null; | ||
function ensureLib() { | function ensureLib() { | ||
if (_libPromise) return _libPromise; | if (_libPromise) return _libPromise; | ||
_libPromise = | _libPromise = preflightCheck().then(function(){ | ||
log('lade Transformers (ESM via Shim)…', TRANSFORMERS_SHIM); | |||
return loadModuleFile(TRANSFORMERS_SHIM).then(function () { | |||
var t0 = Date.now(); | |||
return new Promise(function (resolve, reject) { | |||
(function spin() { | |||
if (window.transformers && typeof window.transformers === 'object') { | |||
// Env konfigurieren | |||
var env = window.transformers.env; | |||
env.allowLocalModels = true; | |||
env.allowRemoteModels = false; | |||
env.localModelPath = LOCAL_MODEL_PATH; | |||
env.backends = env.backends || {}; | |||
env.backends.onnx = env.backends.onnx || {}; | |||
// Forciere WASM (WebGPU kann je nach Browser/CSP zicken) | |||
env.backends.onnx.preferredBackend = 'wasm'; | |||
env.backends.onnx.wasm = env.backends.onnx.wasm || {}; | |||
env.backends.onnx.wasm.wasmPaths = WASM_DIR; | |||
log('Transformers bereit.'); | |||
resolve(window.transformers); | |||
} else if (Date.now() - t0 > 10000) { | |||
reject(new Error('Transformers-ESM nicht verfügbar (Timeout).')); | |||
} else { | |||
setTimeout(spin, 50); | |||
} | |||
})(); | |||
}); | |||
}); | |||
}); | }); | ||
return _libPromise; | return _libPromise; | ||
| Zeile 114: | Zeile 167: | ||
if (_modelPromise) return _modelPromise; | if (_modelPromise) return _modelPromise; | ||
_modelPromise = ensureLib().then(function (tf) { | _modelPromise = ensureLib().then(function (tf) { | ||
status('Modell laden …'); | |||
log('lade Processor & Model…', MODEL_ID); | |||
var p = Promise.all([ | |||
tf.AutoProcessor.from_pretrained(MODEL_ID), | tf.AutoProcessor.from_pretrained(MODEL_ID), | ||
tf.CLIPVisionModelWithProjection.from_pretrained(MODEL_ID, { quantized: true }) | tf.CLIPVisionModelWithProjection.from_pretrained(MODEL_ID, { quantized: true }) | ||
| Zeile 121: | Zeile 177: | ||
try { | try { | ||
var backend = (pack.model && pack.model.session && pack.model.session.executionProvider) || 'unknown'; | var backend = (pack.model && pack.model.session && pack.model.session.executionProvider) || 'unknown'; | ||
log('Modell geladen | Backend:', backend); | |||
} catch (e) { | } catch (e) { log('Modell geladen'); } | ||
return pack; | return pack; | ||
}); | }); | ||
return timeoutPromise(p, 25000, 'Model from_pretrained'); | |||
}); | }); | ||
return _modelPromise; | return _modelPromise; | ||
} | } | ||
| Zeile 131: | Zeile 190: | ||
function buildEmbeddingFromFile(file) { | function buildEmbeddingFromFile(file) { | ||
return ensureModel().then(function (pack) { | return ensureModel().then(function (pack) { | ||
return fileToCanvasExif(file).then(function (canvas) { | status('Bild vorbereiten …'); | ||
return canvasToBlobPromise(canvas).then(function (blob) { | return timeoutPromise(fileToCanvasExif(file), 8000, 'Canvas aus Bild').then(function (canvas) { | ||
return pack.mod.RawImage.fromBlob(blob).then(function (raw) { | return timeoutPromise(canvasToBlobPromise(canvas), 8000, 'Canvas→Blob').then(function (blob) { | ||
return pack.processor(raw, { return_tensors: 'pt' }).then(function (inputs) { | status('Bild analysieren …'); | ||
return pack.model.forward({ pixel_values: inputs.pixel_values }).then(function (out) { | return timeoutPromise(pack.mod.RawImage.fromBlob(blob), 8000, 'RawImage').then(function (raw) { | ||
return timeoutPromise(pack.processor(raw, { return_tensors: 'pt' }), 12000, 'Processor').then(function (inputs) { | |||
return timeoutPromise(pack.model.forward({ pixel_values: inputs.pixel_values }), 20000, 'Model forward').then(function (out) { | |||
var vec = (out && out.image_embeds && out.image_embeds.data) || (out && out.image_embeds); | var vec = (out && out.image_embeds && out.image_embeds.data) || (out && out.image_embeds); | ||
if (!(vec instanceof Float32Array)) throw new Error('Embedding-Format unerwartet'); | if (!(vec instanceof Float32Array)) throw new Error('Embedding-Format unerwartet'); | ||
| Zeile 166: | Zeile 227: | ||
} | } | ||
function saveIndexJSON(newArray, summary) { | |||
return mw.loader.using(['mediawiki.api']).then(function () { | |||
var api = new mw.Api(); | |||
var text = JSON.stringify(newArray, null, 2) + '\n'; | |||
function doEdit() { | |||
return api.postWithToken('csrf', { | return api.postWithToken('csrf', { | ||
action: 'edit', | action: 'edit', | ||
| Zeile 178: | Zeile 241: | ||
bot: 1 | bot: 1 | ||
}); | }); | ||
} | |||
// 1. Versuch | |||
return doEdit()["catch"](function (e) { | |||
// Prüfen, ob es ein badtoken war | |||
var code = (e && e.code) || | |||
(e && e.error && e.error.code) || | |||
null; | |||
if (code === 'badtoken') { | |||
warn('badtoken – versuche mit neuem Token erneut …', e); | |||
// neues Api-Objekt, zweiter Versuch | |||
api = new mw.Api(); | |||
return doEdit(); | |||
} | |||
// anderer Fehler -> normal weiterwerfen | |||
throw e; | |||
}); | }); | ||
}); | |||
} | |||
// ---------- Neu: Duplikat-Erkennung über EMBED ---------- | |||
/**
 * Duplicate detection: finds the first index entry whose `embed` string is
 * exactly equal to the given encoded embedding.
 *
 * @param {Array<Object>} indexArray - Index entries; entries that are falsy
 *   or whose `embed` is not a string are skipped.
 * @param {string} embedB64 - Encoded embedding to look for.
 * @returns {?Object} The matching entry, or null when the index is
 *   empty/missing, `embedB64` is falsy, or nothing matches.
 */
function findEntryByEmbed(indexArray, embedB64) {
  var searchable = Boolean(indexArray) && Boolean(indexArray.length) && Boolean(embedB64);
  if (!searchable) {
    return null;
  }

  var pos = 0;
  while (pos < indexArray.length) {
    var entry = indexArray[pos];
    pos += 1;
    if (entry && typeof entry.embed === 'string' && entry.embed === embedB64) {
      return entry; // duplicate found
    }
  }
  return null;
}
| Zeile 184: | Zeile 279: | ||
var runBtn = document.getElementById('idx-run'); | var runBtn = document.getElementById('idx-run'); | ||
if (!runBtn) { | if (!runBtn) { | ||
warn('Button #idx-run nicht gefunden – ist das HTML auf der Seite eingebunden?'); | |||
} else { | } else { | ||
runBtn.addEventListener('click', function () { | runBtn.addEventListener('click', function () { | ||
| Zeile 205: | Zeile 300: | ||
runBtn.disabled = true; | runBtn.disabled = true; | ||
status('Embedding berechnen …'); | status('Embedding berechnen …'); | ||
log('Start embedding…', title, file && file.name); | |||
buildEmbeddingFromFile(file).then(function (vec) { | buildEmbeddingFromFile(file).then(function (vec) { | ||
| Zeile 213: | Zeile 309: | ||
status('Index laden …'); | status('Index laden …'); | ||
return fetchIndexJSON().then(function (arr) { | return fetchIndexJSON().then(function (arr) { | ||
// NEU: Duplikat-Check über EMBED | |||
var existing = findEntryByEmbed(arr, b64); | |||
if (existing) { | |||
log('Duplikat-Embedding erkannt, nichts gespeichert.', existing); | |||
status('Embedding bereits im Index – nichts gespeichert.'); | |||
alert( | |||
'Dieses Bild (Embedding) ist bereits im LabelScan-Index hinterlegt.\n' + | |||
'Vorhandener Eintrag: "' + (existing.title || 'unbekannt') + '".\n\n' + | |||
'Es wurde nichts geändert.' | |||
); | |||
// Signal nach außen: Speichern übersprungen | |||
return 'SKIP_DUPLICATE'; | |||
} | |||
// Kein Duplikat → anhängen & speichern | |||
arr.push({ title: title, thumb: thumb, embed: b64 }); | arr.push({ title: title, thumb: thumb, embed: b64 }); | ||
status('Speichern …'); | status('Speichern …'); | ||
return saveIndexJSON(arr, 'LabelScan: +1 embedding für "' + title + '"'); | return saveIndexJSON(arr, 'LabelScan: +1 embedding für "' + title + '"'); | ||
}); | }); | ||
}).then(function () { | }).then(function (result) { | ||
status('Gespeichert ✅'); | if (result === 'SKIP_DUPLICATE') { | ||
log('Speichern übersprungen (Duplikat-Embedding).'); | |||
// Status ist oben bereits gesetzt | |||
} else { | |||
status('Gespeichert ✅'); | |||
log('Done.'); | |||
} | |||
})["catch"](function (e) { | })["catch"](function (e) { | ||
err(e); | |||
status('Fehler ❌ ' + (e && e.message ? e.message : e)); | status('Fehler ❌ ' + (e && e.message ? e.message : e)); | ||
alert('Fehler: ' + (e && e.message ? e.message : e)); | alert( | ||
'Fehler beim Erzeugen/Speichern:\n\n' + | |||
(e && e.message ? e.message : e) + | |||
'\n\nPrüfe bitte in der Konsole die [LabelScanIndexer]-Logs.' | |||
); | |||
}).then(function () { | }).then(function () { | ||
runBtn.disabled = false; | runBtn.disabled = false; | ||
| Zeile 229: | Zeile 350: | ||
} | } | ||
log('bereit'); | |||
})(); | })(); | ||