MediaWiki:Gadget-LabelScan.js: Unterschied zwischen den Versionen
Admin (Diskussion | Beiträge) Keine Bearbeitungszusammenfassung Markierung: Zurückgesetzt |
Admin (Diskussion | Beiträge) Keine Bearbeitungszusammenfassung Markierung: Manuelle Zurücksetzung |
||
| Zeile 3: | Zeile 3: | ||
'use strict'; | 'use strict'; | ||
// ========= | // ============================= | ||
// | // KONFIGURATION | ||
// ============================= | |||
// ← Für Tests leer lassen: const ADOS_CATEGORIES = []; | |||
const ADOS_CATEGORIES = [ | const ADOS_CATEGORIES = [ | ||
'Alle A Dream of Scotland Abfüllungen', | 'Alle A Dream of Scotland Abfüllungen', | ||
| Zeile 15: | Zeile 18: | ||
]; | ]; | ||
const KNOWN_TOKENS = [ | const KNOWN_TOKENS = [ | ||
// | // Serien / Reihen | ||
'The Tasteful 8', 'Heroes of Childhood', 'A Dream of Scotland', | |||
'A Dream of Ireland', 'The Fine Art of Whisky', 'Friendly Mr. Z', | |||
'Die Whisky Elfen', 'Rumbastic', | |||
// Brennereien / Regionen | |||
'Ardbeg','Ardmore','Arran','Auchroisk','Ben Nevis','Blair Athol','Bowmore', | 'Ardbeg','Ardmore','Arran','Auchroisk','Ben Nevis','Blair Athol','Bowmore', | ||
'Caol Ila','Clynelish','Glenallachie','Glenrothes','Longmorn','Lagavulin', | 'Caol Ila','Clynelish','Glenallachie','Glenrothes','Longmorn','Lagavulin', | ||
'Tullibardine','Dalmore','Benrinnes','Mortlach','Glenlivet','Inchgower', | 'Tullibardine','Dalmore','Benrinnes','Mortlach','Glenlivet','Inchgower', | ||
' | 'Islay','Speyside','Highland','Lowland','Campbeltown','Ireland','Irland', | ||
// typische Label-Wörter | |||
'Cask Strength',' | 'Cask Strength','First Fill','Bourbon Barrel','Sherry','PX','Oloroso' | ||
]; | ]; | ||
// ========= UI | // ============================= | ||
// UI-Hilfen | |||
// ============================= | |||
function hasUI () { | function hasUI () { | ||
return !!document.getElementById('ados-scan-run') && | return !!document.getElementById('ados-scan-run') && | ||
!!document.getElementById('ados-scan-file'); | !!document.getElementById('ados-scan-file'); | ||
} | } | ||
function setStatus (t) { | function setStatus (t) { | ||
var el = document.getElementById('ados-scan-status'); | |||
if (el) el.textContent = t || ''; | if (el) el.textContent = t || ''; | ||
} | } | ||
function setProgress (p) { | function setProgress (p) { | ||
var bar = document.getElementById('ados-scan-progress'); | |||
if (!bar) return; | if (!bar) return; | ||
if (p == null) { bar.hidden = true; bar.value = 0; } | if (p == null) { bar.hidden = true; bar.value = 0; } | ||
else { bar.hidden = false; bar.value = Math.max(0, Math.min(1, p)); } | else { bar.hidden = false; bar.value = Math.max(0, Math.min(1, p)); } | ||
} | } | ||
function showPreview (file) { | function showPreview (file) { | ||
var url = URL.createObjectURL(file); | |||
var prev = document.getElementById('ados-scan-preview'); | |||
if (prev) { | if (prev) { | ||
prev.innerHTML = '<img alt="Vorschau" src="' + url + '">'; | prev.innerHTML = '<img alt="Vorschau" src="' + url + '">'; | ||
| Zeile 51: | Zeile 63: | ||
} | } | ||
} | } | ||
// ========= | function showOCRText (t) { | ||
var el = document.getElementById('ados-scan-ocr'); | |||
function | if (el) el.textContent = (t || '').trim(); | ||
if ( | } | ||
if (window.Tesseract | // ============================= | ||
// Tesseract laden (nur 1x) | |||
// ============================= | |||
var tesseractReady; | |||
function ensureTesseract () { | |||
if (tesseractReady) return tesseractReady; | |||
tesseractReady = new Promise(function (resolve, reject) { | |||
if (window.Tesseract) return resolve(); | |||
var s = document.createElement('script'); | |||
s.src = 'https://cdn.jsdelivr.net/npm/tesseract.js@5/dist/tesseract.min.js'; | |||
s.async = true; | |||
s.onload = resolve; | |||
s.onerror = function () { | |||
var s2 = document.createElement('script'); | |||
s2.src = 'https://unpkg.com/tesseract.js@5/dist/tesseract.min.js'; | |||
s2.async = true; | |||
s2.onload = resolve; | |||
s2.onerror = function () { reject(new Error('Tesseract konnte nicht geladen werden')); }; | |||
document.head.appendChild(s2); | |||
}; | |||
document.head.appendChild(s); | |||
document.head.appendChild( | |||
} | |||
}); | }); | ||
return | return tesseractReady; | ||
} | } | ||
// ========= | // ============================= | ||
function | // Vorverarbeitung (OCR) | ||
const MAX = | // Graustufen + Unsharp + adaptive Schwelle | ||
const | // ============================= | ||
const w = | |||
async function preprocessImage (file) { | |||
const img = await new Promise((res, rej) => { | |||
const o = new Image(); | |||
o.onload = () => res(o); | |||
o.onerror = rej; | |||
o.src = URL.createObjectURL(file); | |||
}); | |||
const MAX = 1800; | |||
const s = Math.min(1, (img.width > img.height) ? MAX / img.width : MAX / img.height); | |||
const w = Math.round(img.width * s), h = Math.round(img.height * s); | |||
const c = document.createElement('canvas'); c.width = w; c.height = h; | |||
const | const g = c.getContext('2d', { willReadFrequently: true }); | ||
g.imageSmoothingEnabled = true; | |||
g.drawImage(img, 0, 0, w, h); | |||
// | // → Graustufen | ||
let | let id = g.getImageData(0, 0, w, h), d = id.data; | ||
for (let i=0;i<d.length;i+=4){ | |||
for (let i | const y = 0.2126*d[i] + 0.7152*d[i+1] + 0.0722*d[i+2]; | ||
const | d[i]=d[i+1]=d[i+2]=y; | ||
} | } | ||
g.putImageData(id, 0, 0); | |||
// | // → Unsharp (leichter Hochpass) | ||
id = g.getImageData(0,0,w,h); d = id.data; | |||
const | const copy = new Uint8ClampedArray(d); | ||
const | const idx = (x,y)=>4*(y*w+x); | ||
for (let y=1;y<h-1;y++){ | for (let y=1;y<h-1;y++){ | ||
for (let x=1;x<w-1;x++){ | for (let x=1;x<w-1;x++){ | ||
const i0=idx(x,y), a=copy[i0], b=copy[idx(x-1,y)], c0=copy[idx(x+1,y)], | |||
const | d0=copy[idx(x,y-1)], e=copy[idx(x,y+1)]; | ||
const lap = 4*a - b - c0 - d0 - e; | |||
const v = Math.max(0, Math.min(255, a + 0.3*lap)); | |||
d[i0]=d[i0+1]=d[i0+2]=v; | |||
} | } | ||
} | } | ||
g.putImageData(id,0,0); | |||
// | // → adaptive Schwelle (lokaler Mittelwert) | ||
const win = 25, half = (win|0); | |||
let | id = g.getImageData(0,0,w,h); d = id.data; | ||
for (let y=0;y<h;y++){ | |||
for (let x=0;x<w;x++){ | |||
let sum=0, cnt=0; | |||
for (let yy=Math.max(0,y-half); yy<=Math.min(h-1,y+half); yy+=5){ | |||
for (let xx=Math.max(0,x-half); xx<=Math.min(w-1,x+half); xx+=5){ | |||
sum += d[4*(yy*w+xx)]; | |||
cnt++; | |||
} | |||
} | |||
const thr = (sum/cnt) - 6; | |||
const i = 4*(y*w+x); | |||
const v = d[i] < thr ? 0 : 255; | |||
d[i]=d[i+1]=d[i+2]=v; | |||
} | |||
} | } | ||
g.putImageData(id,0,0); | |||
return c; | return c; | ||
} | } | ||
function | // Hilfsfunktionen für Varianten | ||
const c = document.createElement('canvas'); | function crop(canvas, x, y, w, h){ | ||
const c = document.createElement('canvas'); c.width=w; c.height=h; | |||
c.getContext('2d').drawImage( | c.getContext('2d').drawImage(canvas, x, y, w, h, 0, 0, w, h); | ||
return c; | return c; | ||
} | |||
function rotate(canvas, deg){ | |||
const r = document.createElement('canvas'); | |||
const ctx = r.getContext('2d'); | |||
if (deg % 180 === 0){ r.width=canvas.width; r.height=canvas.height; } | |||
else { r.width=canvas.height; r.height=canvas.width; } | |||
ctx.translate(r.width/2, r.height/2); | |||
ctx.rotate(deg*Math.PI/180); | |||
ctx.drawImage(canvas, -canvas.width/2, -canvas.height/2); | |||
return r; | |||
} | } | ||
function | async function ocrOne(canvas, lang) { | ||
const res = await Tesseract.recognize(canvas, lang, { | |||
// Sparse text funktioniert bei Labels (verschieden orientierte Textblöcke) | |||
const | tessedit_pageseg_mode: 11, | ||
preserve_interword_spaces: 1 | |||
// | |||
}); | }); | ||
return | return { text: (res?.data?.text||'').trim(), conf: res?.data?.confidence||0 }; | ||
} | } | ||
// ========= | // ============================= | ||
// Mehrfach-OCR (Rotationen/Regionen) + Fallback-Sprache | |||
// ============================= | |||
async function runOCR(file){ | |||
const | await ensureTesseract(); | ||
setProgress(0.01); | |||
const base = await preprocessImage(file); | |||
// Kandidatenflächen | |||
const variants = []; | |||
variants.push(base); // komplett | |||
variants.push(crop(base, 0, 0, Math.round(base.width*0.4), base.height)); // linke Spalte | |||
variants.push(crop(base, 0, Math.round(base.height*0.72), base.width, Math.round(base.height*0.28))); // unteres Banner | |||
// + Rotationen | |||
const more = []; | |||
for (const v of variants){ | |||
more.push(v, rotate(v, 90), rotate(v, -90)); | |||
} | |||
for (const | // zwei Sprachmodi testen | ||
for (const | const results = []; | ||
for (const canv of more){ | |||
for (const lang of ['deu+eng','eng']){ | |||
try { | try { | ||
const r = await ocrOne(canv, lang); | |||
const | results.push(r); | ||
} catch(e){ /* einzelne Fehlschläge ignorieren */ } | |||
} catch (e) { | |||
} | } | ||
} | } | ||
setProgress(null); | setProgress(null); | ||
results.sort((a,b)=> (b.conf||0)-(a.conf||0)); | |||
return (results[0]?.text)||''; | |||
} | |||
// ============================= | |||
// Hinweise aus OCR | |||
// ============================= | |||
function extractHints (text) { | function extractHints (text) { | ||
const raw = String(text || '').replace(/\s+/g, ' ').trim(); | const raw = String(text || '').replace(/\s+/g, ' ').trim(); | ||
const | const foundNames = []; | ||
KNOWN_TOKENS.forEach(t => { | KNOWN_TOKENS.forEach(t => { | ||
const re = new RegExp('\\b' + t.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') + '\\b', 'i'); | const re = new RegExp('\\b' + t.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') + '\\b', 'i'); | ||
if (re.test(raw)) | if (re.test(raw)) foundNames.push(t); | ||
}); | }); | ||
const ages = []; | const ages = []; | ||
const ageRe = /\b([1-9]\d?)\s?(?:years?|yo|jahr(?:e)?)\b/gi; | |||
while ((m = ageRe.exec(raw)) !== null) { const n = m[1]; if (!ages.includes(n)) ages.push(n); } | let m; | ||
while ((m = ageRe.exec(raw)) !== null) { | |||
const n = m[1]; if (!ages.includes(n)) ages.push(n); | |||
} | |||
const years = []; | const years = []; | ||
const yearRe = /\b(19|20)\d{2}\b/g; | const yearRe = /\b(19|20)\d{2}\b/g; | ||
while ((m = yearRe.exec(raw)) !== null) { | while ((m = yearRe.exec(raw)) !== null) { | ||
if (!years.includes(m[0])) years.push(m[0]); | |||
} | |||
const wordRe = /\b[A-ZÄÖÜ][A-Za-zÄÖÜäöüß\-]{3,}\b/g; | const wordRe = /\b[A-ZÄÖÜ][A-Za-zÄÖÜäöüß\-]{3,}\b/g; | ||
while (( | const uniq = new Set(); let w; const words = []; | ||
while ((w = wordRe.exec(raw)) !== null) { | |||
const s = w[0]; | |||
if (!uniq.has(s)) { uniq.add(s); words.push(s); if (words.length >= 8) break; } | |||
} | |||
return { names, ages, years, words, raw }; | return { names: foundNames, ages, years, words, raw }; | ||
} | |||
// ============================= | |||
// Suche (3 Pässe) + Fallbacks | |||
// ============================= | |||
function esc (s) { return mw.html.escape(String(s || '')); } | |||
function incatStr () { | |||
return (ADOS_CATEGORIES || []).map(c => 'incategory:"' + c + '"').join(' '); | |||
} | } | ||
async function searchWikiSmart (hints, limit) { | async function searchWikiSmart (hints, limit) { | ||
await mw.loader.using('mediawiki.api'); | await mw.loader.using(['mediawiki.api','mediawiki.util','mediawiki.html']); | ||
const api = new mw.Api(); | const api = new mw.Api(); | ||
const ns0 = 0 | const ns0 = 0; | ||
const MAX = limit || 12; | |||
// PASS 1: | // PASS 1: intitle-Kombis (präzise) | ||
const pass1 = []; | |||
if (hints.names.length) { | if (hints.names.length) { | ||
hints.names.forEach(n => { | hints.names.forEach(n => { | ||
if (hints.ages.length) hints.ages.forEach(a => | if (hints.ages.length) hints.ages.forEach(a => pass1.push(`intitle:"${n}" intitle:${a} ${incatStr()}`)); | ||
if (hints.years.length) hints.years.forEach(y => | if (hints.years.length) hints.years.forEach(y => pass1.push(`intitle:"${n}" "${y}" ${incatStr()}`)); | ||
pass1.push(`intitle:"${n}" ${incatStr()}`); | |||
}); | }); | ||
} | } | ||
// PASS | // PASS 2: gewichtete Volltextsuche | ||
const key = [] | |||
.concat(hints.names.slice(0, 2), hints.ages.slice(0, 1), hints.years.slice(0, 1), hints.words.slice(0, 3)) | |||
.map(x => `"${x}"`).join(' '); | |||
const pass2 = key ? [ `${key} ${incatStr()}` ] : []; | |||
// PASS | // PASS 3: Prefix auf Titel | ||
if (hints.names.length) | const pass3 = []; | ||
if (hints.names.length) pass3.push(hints.names[0]); | |||
if (!pass3.length && hints.words.length) pass3.push(hints.words[0]); | |||
const seen = new Set(); const out = []; | const seen = new Set(); const out = []; | ||
async function | |||
async function runSr (q) { | |||
const r = await api.get({ action:'query', list:'search', srsearch:q, srnamespace: ns0, srlimit: MAX, formatversion:2 }); | const r = await api.get({ action: 'query', list: 'search', srsearch: q, srnamespace: ns0, srlimit: MAX, formatversion: 2 }); | ||
(r.query?.search || []).forEach(it => { if ( | (r.query?.search || []).forEach(it => { | ||
const k = it.title; | |||
if (seen.has(k)) return; | |||
seen.add(k); | |||
out.push(it); | |||
}); | |||
} | } | ||
for (const q of | |||
await | for (const q of pass1) { await runSr(q); if (out.length >= MAX) return out.slice(0, MAX); } | ||
for (const q of pass2) { await runSr(q); if (out.length >= MAX) return out.slice(0, MAX); } | |||
// Prefix (list=prefixsearch) | |||
for (const p of pass3) { | |||
const r = await api.get({ action: 'query', list: 'prefixsearch', pssearch: p, psnamespace: ns0, pslimit: MAX }); | |||
(r.query?.prefixsearch || []).forEach(it => { | |||
const title = it.title || it['*']; | |||
const k = title; | |||
if (seen.has(k)) return; | |||
seen.add(k); | |||
out.push({ title, snippet: '' }); | |||
}); | |||
if (out.length >= MAX) break; | if (out.length >= MAX) break; | ||
} | } | ||
return out.slice(0, MAX); | return out.slice(0, MAX); | ||
} | } | ||
function | // ganz einfacher Fuzzy-Fallback auf Suchergebnissen | ||
function scoreTitle(title, hints){ | |||
const t = String(title||'').toLowerCase(); | |||
let s = 0; | |||
hints.names.forEach(n => { if (t.includes(n.toLowerCase())) s += 1.0; }); | |||
hints.words.forEach(n => { if (t.includes(n.toLowerCase())) s += 0.4; }); | |||
hints.ages.forEach(a => { if (t.includes(String(a))) s += 0.4; }); | |||
hints.years.forEach(y => { if (t.includes(String(y))) s += 0.4; }); | |||
return s; | |||
} | |||
async function fallbackFuzzyTitles(hints, limit){ | |||
await mw.loader.using('mediawiki.api'); | |||
const api = new mw.Api(); | |||
const MAX = limit || 12; | |||
// Breite Suche mit Tokens (mit/ohne Kategorie) | |||
const q1 = [] | |||
.concat(hints.names.slice(0,2), hints.ages.slice(0,1), hints.years.slice(0,1), hints.words.slice(0,3)) | |||
.map(x => `"${x}"`).join(' '); | |||
const q = `${q1} ${incatStr()}`.trim(); | |||
const r = await api.get({ action:'query', list:'search', srsearch:q || hints.raw.split(/\s+/).slice(0,6).join(' '), srlimit:50, formatversion:2 }); | |||
const items = (r.query?.search || []); | |||
const scored = items.map(it => ({ ...it, _score: scoreTitle(it.title, hints) })); | |||
scored.sort((a,b)=> b._score - a._score); | |||
const top = scored.slice(0, MAX).filter(x=> x._score >= 0.10); // großzügiger | |||
return top; | |||
} | |||
async function broadSearchNoCategory(hints, limit){ | |||
await mw.loader.using('mediawiki.api'); | |||
const api = new mw.Api(); | |||
const MAX = limit || 12; | |||
const parts = [] | |||
.concat(hints.names.slice(0,2), hints.ages.slice(0,1), hints.years.slice(0,1), hints.words.slice(0,3)) | |||
.map(x => `"${x}"`); | |||
const q = parts.length ? parts.join(' ') : hints.raw.split(/\s+/).slice(0,6).join(' '); | |||
const r = await api.get({ action:'query', list:'search', srsearch:q, srlimit:MAX, formatversion:2 }); | |||
return (r.query?.search || []); | |||
} | |||
// ============================= | |||
// Ergebnisse rendern | |||
// ============================= | |||
function renderResults (items) { | function renderResults (items) { | ||
var box = document.getElementById('ados-scan-results'); | |||
if (!box) return; | if (!box) return; | ||
box.innerHTML = ''; | box.innerHTML = ''; | ||
| Zeile 348: | Zeile 386: | ||
return; | return; | ||
} | } | ||
items.slice(0, 12).forEach(it | items.slice(0, 12).forEach(function (it) { | ||
var title = it.title || ''; | |||
var link = mw.util.getUrl(title.replace(/ /g, '_')); | |||
var snip = String(it.snippet || '').replace(/<\/?span[^>]*>/g, '').replace(/"/g, '"'); | |||
var div = document.createElement('div'); | |||
div.className = 'ados-hit'; | div.className = 'ados-hit'; | ||
div.innerHTML = | div.innerHTML = | ||
'<b><a href="'+link+'">'+esc(title)+'</a></b>' + | '<b><a href="' + link + '">' + esc(title) + '</a></b>' + | ||
(snip ? '<div class="meta">'+snip+'</div>' : ''); | (snip ? '<div class="meta">' + snip + '</div>' : ''); | ||
box.appendChild(div); | box.appendChild(div); | ||
}); | }); | ||
} | } | ||
// ========= | // ============================= | ||
// Binding | |||
function bind() { | // ============================= | ||
var BOUND = false; | |||
function bind () { | |||
if (BOUND || !hasUI()) return; | if (BOUND || !hasUI()) return; | ||
var runBtn = document.getElementById('ados-scan-run'); | |||
var fileIn = document.getElementById('ados-scan-file'); | |||
var bigBtn = document.getElementById('ados-scan-bigbtn'); | |||
var form = document.getElementById('ados-scan-form'); | |||
if (!runBtn || !fileIn) return; | if (!runBtn || !fileIn) return; | ||
BOUND=true; | if (runBtn.dataset.bound === '1') return; | ||
runBtn.dataset.bound = '1'; BOUND = true; | |||
if (bigBtn) bigBtn.addEventListener('click', () | if (bigBtn) bigBtn.addEventListener('click', function () { fileIn.click(); }); | ||
fileIn.addEventListener('change', function(){ | fileIn.addEventListener('change', function () { | ||
if (this.files && this.files[0]) showPreview(this.files[0]); | if (this.files && this.files[0]) showPreview(this.files[0]); | ||
}); | }); | ||
function onSubmit(ev){ | |||
ev.preventDefault(); | ev.preventDefault(); | ||
if (!(fileIn.files && fileIn.files[0])) { alert('Bitte ein Foto auswählen oder aufnehmen.'); return; } | if (!(fileIn.files && fileIn.files[0])) { alert('Bitte ein Foto auswählen oder aufnehmen.'); return; } | ||
try { | var f = fileIn.files[0]; | ||
(async function(){ | |||
try { | |||
runBtn.disabled = true; runBtn.textContent = 'Erkenne …'; | |||
setStatus('Erkenne Label …'); | |||
const text = await runOCR(f); | |||
showOCRText(text); | |||
setStatus('Suche im Wiki …'); | |||
const hints = extractHints(text); | |||
let hits = await searchWikiSmart(hints, 12); | |||
if (!hits || !hits.length) { | |||
setStatus('Kein direkter Treffer – Fuzzy über Kategorien …'); | |||
hits = await fallbackFuzzyTitles(hints, 12); | |||
} | |||
if (!hits || !hits.length) { | |||
setStatus('Kein Treffer – breite Suche ohne Kategorien …'); | |||
} | hits = await broadSearchNoCategory(hints, 12); | ||
}); | } | ||
renderResults(hits); | |||
setStatus('Fertig.'); | |||
} catch (e) { | |||
console.error('[LabelScan]', e); | |||
setStatus('Fehler bei Erkennung/Suche. Bitte erneut versuchen.'); | |||
} finally { | |||
runBtn.disabled = false; runBtn.textContent = 'Erkennen & suchen'; | |||
} | |||
})(); | |||
} | |||
runBtn.addEventListener('click', onSubmit); | |||
if (form) form.addEventListener('submit', onSubmit); | |||
// Sicherheit | |||
var wrap = document.getElementById('ados-labelscan'); | |||
if (wrap) wrap.style.position = 'relative'; | |||
runBtn.style.position = 'relative'; | |||
runBtn.style.zIndex = '9999'; | |||
runBtn.style.pointerEvents = 'auto'; | |||
} | } | ||
if (document.readyState === 'loading') document.addEventListener('DOMContentLoaded', bind); | if (document.readyState === 'loading') { | ||
else bind(); | document.addEventListener('DOMContentLoaded', bind); | ||
setTimeout(bind, 250); setTimeout(bind, 1000); | } else { | ||
new MutationObserver(() | bind(); | ||
} | |||
setTimeout(bind, 250); | |||
setTimeout(bind, 1000); | |||
var mo = new MutationObserver(function () { if (!BOUND) bind(); }); | |||
mo.observe(document.documentElement || document.body, { childList: true, subtree: true }); | |||
})(); | })(); | ||