User:Polygnotus/Scripts/DeduplicateReferences.js
Appearance
Code that you insert on this page could contain malicious content capable of compromising your account. If you import a script from another page with "importScript", "mw.loader.load", "iusc", or "lusc", take note that this causes you to dynamically load a remote script, which could be changed by others. Editors are responsible for all edits and actions they perform, including by scripts. User scripts are not centrally supported and may malfunction or become inoperable due to software changes. A guide to help you find broken scripts is available. If you are unsure whether code you are adding to this page is safe, you can ask at the appropriate village pump. This code will be executed when previewing this page.
Documentation for this user script can be added at User:Polygnotus/Scripts/DeduplicateReferences.
// <nowiki>
// Merges only exact (character-for-character) duplicate references.
// Unnamed duplicates get a name derived from the domain of the first URL they contain.
// Function to deduplicate references in Wikipedia articles
function deduplicateReferences() {
// Get the edit textarea and summary input
const editTextarea = document.getElementById('wpTextbox1');
const summaryInput = document.getElementById('wpSummary');
if (!editTextarea || !summaryInput) return;
let content = editTextarea.value;
// Regular expression to match <ref> tags
const refRegex = /<ref[^>]*>[\s\S]*?<\/ref>/gi;
// Object to store all references
const allRefs = {};
// Set to store all used reference names
const usedNames = new Set();
// Blacklist of reference names to ignore
const blacklist = [
"doi_org",
"jstor_org",
"amazon_com",
"books_google_com",
"web_archive_org",
"worldcat_org",
"dx_doi_org"
// Add more blacklisted names here
];
// Function to extract domain name from URL
function extractDomain(url) {
try {
let domain = new URL(url).hostname;
domain = domain.replace(/^www\./, ''); // Remove 'www.' if present
return domain === 'archive.org' ? extractDomain(url.split('archive.org/web/')[1]) : domain;
} catch (e) {
return null;
}
}
// Function to generate a unique name for the reference
function generateUniqueName(ref) {
const urlMatch = ref.match(/https?:\/\/[^\s<>"]+/i);
if (urlMatch) {
const domain = extractDomain(urlMatch[0]);
if (domain) {
let baseName = domain.replace(/\./g, '_');
let uniqueName = baseName;
let counter = 1;
while (usedNames.has(uniqueName)) {
uniqueName = `${baseName}_${counter}`;
counter++;
}
usedNames.add(uniqueName);
return uniqueName;
}
}
return null;
}
// Function to extract existing name from a reference
function extractExistingName(ref) {
const nameMatch = ref.match(/name\s*=\s*(["']?)([^"'\s/>]+(?:\s+[^"'\s/>]+)*)\1/i);
return nameMatch ? nameMatch[2] : null;
}
// Function to create a reference tag
function createRefTag(name, content = null) {
if (content) {
return `<ref name="${name}">${content}</ref>`;
} else {
return `<ref name="${name}" />`;
}
}
// Function to check if a reference is blacklisted
function isBlacklisted(ref) {
const name = extractExistingName(ref);
return name && blacklist.includes(name);
}
// First pass: collect all references and used names
content.replace(refRegex, (match) => {
if (!isBlacklisted(match)) {
const existingName = extractExistingName(match);
if (existingName) {
usedNames.add(existingName);
}
if (allRefs[match]) {
allRefs[match].count++;
} else {
allRefs[match] = { count: 1, name: existingName, firstOccurrence: match };
}
}
return match;
});
// Second pass: replace duplicates with named references
let deduplicatedCount = 0;
content = content.replace(refRegex, (match) => {
if (isBlacklisted(match)) {
return match; // Return blacklisted references unchanged
}
if (allRefs[match] && allRefs[match].count > 1) {
if (!allRefs[match].name) {
// This is a duplicate without a name
const generatedName = generateUniqueName(match);
if (generatedName && !blacklist.includes(generatedName)) {
allRefs[match].name = generatedName;
allRefs[match].firstOccurrence = createRefTag(generatedName, match.match(/<ref[^>]*>([\s\S]*)<\/ref>/)[1]);
return allRefs[match].firstOccurrence;
}
} else {
// This is a named reference
if (match === allRefs[match].firstOccurrence) {
// This is the first occurrence, keep it as is
return match;
} else {
// This is a subsequent occurrence, replace with short form
deduplicatedCount++;
return createRefTag(allRefs[match].name);
}
}
}
return match; // Return unchanged for non-duplicates or blacklisted references
});
// Update the textarea with the deduplicated content
if (deduplicatedCount > 0) {
editTextarea.value = content;
// Add edit summary
let currentSummary = summaryInput.value;
let deduplicationSummary = `Deduplicated ${deduplicatedCount} reference${deduplicatedCount > 1 ? 's' : ''}`;
summaryInput.value = currentSummary ? `${currentSummary} • ${deduplicationSummary}` : deduplicationSummary;
document.editform.wpMinoredit.checked = true;
}
}
// Function to check if the edit textarea is ready
function isEditTextareaReady() {
const editTextarea = document.getElementById('wpTextbox1');
const summaryInput = document.getElementById('wpSummary');
return editTextarea && editTextarea.value && summaryInput;
}
// Function to run deduplication when everything is ready
function runDeduplicationWhenReady() {
if (isEditTextareaReady()) {
deduplicateReferences();
} else {
// If not ready, check again after a short delay
setTimeout(runDeduplicationWhenReady, 100);
}
}
// Run the deduplication when the edit page is fully loaded
if (mw.config.get('wgAction') === 'edit') {
if (document.readyState === 'complete') {
runDeduplicationWhenReady();
} else {
window.addEventListener('load', runDeduplicationWhenReady);
}
}
// </nowiki>