User:Proteins/articlestructure.js
Appearance
Code that you insert on this page could contain malicious content capable of compromising your account. If you import a script from another page with "importScript", "mw.loader.load", "iusc", or "lusc", take note that this causes you to dynamically load a remote script, which could be changed by others. Editors are responsible for all edits and actions they perform, including by scripts. User scripts are not centrally supported and may malfunction or become inoperable due to software changes. A guide to help you find broken scripts is available. If you are unsure whether code you are adding to this page is safe, you can ask at the appropriate village pump. This code will be executed when previewing this page.
Documentation for this user script can be added at User:Proteins/articlestructure.
//<pre>
// Analyze the article's structure
// with kind respects to Dr. pda, whose excellent prosesizebytes.js script was the inspiration
//
// To use this script, add "importScript('User:Proteins/articlestructure.js');" to the monobook.js
// subpage of your own user page; see User:Proteins/monobook.js for a working example.
function articleStructure() {
var alert_string = "";
var diagnostic_string = "";
var read_entire_article = true;
var show_lead_diagnostics = true;
var show_section_diagnostics = false;
var display_individual_words = false;
var using_Internet_Explorer = false;
var spaced_text = "";
var untagged_text = "";
var stripped_text = "";
var unescaped_text = "";
var anchors;
var temp_anchor;
var section_name = "";
var temp_anchor_name = "";
var num_anchors = 0;
var anchor_index = 0;
var anchor_level = 0;
var prev_anchor_level = 0;
var num_H2_anchors = 0;
var H2_anchor_index = 0;
var cutoff_anchor_index = 0;
var cutoff_H2_anchor_index = 0;
var cutoff_child_node_index = 0;
var last_P_child_node_index = 0;
var cutoff_element_node_index = 0;
var num_sections = 0;
var section_index = 0;
var element_node;
var num_element_nodes = 0;
var element_node_index = 0;
var temp_node_name = "";
var parent_node;
var grandparent_node;
var greatgrandparent_node;
var sibling_node;
var next_sibling_node;
var child_node;
var child_nodes;
var prev_child_node;
var num_child_nodes = 0;
var child_node_index = 0;
var child_node_name = "";
var num_prose_counted_nodes = 0;
var grandchild_node;
var grandchild_nodes;
var num_grandchild_nodes = 0;
var grandchild_node_index = 0;
var path_names;
var file_name = "";
var num_characters = 0;
var del_num_characters = 0;
var temp_num_characters = 0;
var temp_word = "";
var num_words = 0;
var word_count = 0;
var word_index = 0;
var nonempty_word_index = 0;
var tentative_num_words = 0;
var num_spaces = 0;
var paragraph_count = 0;
var list_item_count = 0;
var prose_size_bytes = 0;
var total_word_count = 0;
var total_paragraph_count = 0;
var total_list_item_count = 0;
var total_prose_size_bytes = 0;
var section_word_count = new Array();
var section_paragraph_count = new Array();
var section_list_item_count = new Array();
var section_prose_size_bytes = new Array();
var word_count_string = "";
var paragraph_count_string = "";
var list_item_count_string = "";
var prose_size_bytes_string = "";
var temp_paragraph;
var text_paragraphs;
var num_paragraphs = 0;
var paragraph_index = 0;
var temp_list_item;
var text_list_items;
var num_list_items = 0;
var list_item_index = 0;
var temp_image;
var num_pixels = 0;
var image_index = 0;
var image_counter = 0;
var num_raw_images = 0;
var num_nonicon_images = 0;
var num_anchors = 0;
var num_raw_links = 0;
var num_raw_tables = 0;
var num_raw_references = 0;
// check for Internet Explorer browser
using_Internet_Explorer = false;
if (navigator.userAgent.indexOf("MSIE") > -1) {
using_Internet_Explorer = true;
// alert_string = "This script works correctly in every browser — except Internet Explorer. Please be patient!"
// window.alert(alert_string);
}
// Find the cutoff H2 anchor index, where we stop counting things
alert_string = "";
num_H2_anchors = 0;
section_name = "lead section";
prev_anchor_level = 1; //begin at the H1 heading
read_entire_article = true;
anchors = document.anchors;
num_anchors = anchors.length;
for (anchor_index=1; anchor_index<num_anchors; anchor_index++) {
temp_anchor = anchors[anchor_index];
parent_node = temp_anchor.parentNode;
if (!parent_node) { continue; }
sibling_node = parent_node.nextSibling;
if (!sibling_node) { continue; }
// Check headings for jumps upwards in heading level
anchor_level = 0;
if (sibling_node.nodeName == "H1") {
alert_string += " WARNING: Illegal H1 heading in this section\n";
} else if (sibling_node.nodeName == "H2") {
anchor_level = 2;
} else if (sibling_node.nodeName == "H3") {
anchor_level = 3;
} else if (sibling_node.nodeName == "H4") {
anchor_level = 4;
} else if (sibling_node.nodeName == "H5") {
anchor_level = 5;
} else {
next_sibling_node = sibling_node.nextSibling;
if (!next_sibling_node) { continue; }
// Check headings for jumps upwards in heading level
if (next_sibling_node.nodeName == "H1") {
alert_string += " WARNING: Illegal H1 heading in this section\n";
} else if (next_sibling_node.nodeName == "H2") {
anchor_level = 2;
} else if (next_sibling_node.nodeName == "H3") {
anchor_level = 3;
} else if (next_sibling_node.nodeName == "H4") {
anchor_level = 4;
} else if (next_sibling_node.nodeName == "H5") {
anchor_level = 5;
}
} // closes assignment of the anchor level, if any
if (((anchor_level - prev_anchor_level) > 1) && (prev_anchor_level != 0)) {
if (num_H2_anchors == 0) {
alert_string += " WARNING: H" + prev_anchor_level + " to H" + anchor_level + " jump in the lead\n";
} else {
alert_string += " WARNING: H" + prev_anchor_level + " to H" + anchor_level + " jump in \"" + section_name.replace(/(_+)/ig, " ") + "\"\n";
}
}
if (anchor_level > 0) { prev_anchor_level = anchor_level; }
//Check major section headings for closing sections
if (anchor_level == 2) {
num_H2_anchors++;
section_name = temp_anchor.name;
temp_anchor_name = temp_anchor.name;
alert_string += "Section " + num_H2_anchors + " : " + section_name.replace(/(_+)/ig, " ") + "\n";
// alert_string += "Section " + num_H2_anchors + " : " + section_name.replace(/(_+)/ig, " ") + " " + temp_anchor.parentNode.nodeName + " " + sibling_node.nodeName + "\n";
temp_anchor_name = temp_anchor_name.replace(/:$/ig,""); // eliminate colons at end
temp_anchor_name = temp_anchor_name.replace(/s$/ig,""); // eliminate plurals at end
temp_anchor_name = temp_anchor_name.replace(/See_also/ig,"");
temp_anchor_name = temp_anchor_name.replace(/Related_topic/ig,"");
temp_anchor_name = temp_anchor_name.replace(/Related_article/ig,"");
temp_anchor_name = temp_anchor_name.replace(/Further_reading/ig,"");
temp_anchor_name = temp_anchor_name.replace(/External_link/ig,"");
temp_anchor_name = temp_anchor_name.replace(/Footnote/ig,"");
temp_anchor_name = temp_anchor_name.replace(/Note/ig,"");
temp_anchor_name = temp_anchor_name.replace(/Reference/ig,"");
temp_anchor_name = temp_anchor_name.replace(/Citation/ig,"");
temp_anchor_name = temp_anchor_name.replace(/Source/ig,"");
temp_anchor_name = temp_anchor_name.replace(/Link/ig,"");
temp_anchor_name = temp_anchor_name.replace(/s([_\s]+)and([_\s]+)/ig,"");
temp_anchor_name = temp_anchor_name.replace(/([_\s]+)and([_\s]+)/ig,"");
temp_anchor_name = temp_anchor_name.replace(/([_\s]+)/ig,"");
if (temp_anchor_name == "") { break; }
// diagnostic_string = "Section " + num_H2_anchors + " : " + temp_anchor_name + " L: " + temp_anchor_name.length;
// window.alert(diagnostic_string);
} // closes check for H2 anchor
} // closes loop over the anchors
cutoff_anchor_index = anchor_index;
cutoff_H2_anchor_index = num_H2_anchors;
if (cutoff_anchor_index < num_anchors) {
read_entire_article = false;
alert_string += "\nProse counting will stop before the \"" + temp_anchor.name.replace(/(_+)/ig, " ") + "\" section.\n";
} else {
read_entire_article = true;
alert_string += "\nProse counting will cover the entire article.\n";
}
window.alert(alert_string);
// Count child and element nodes
alert_string = "";
num_element_nodes = 0;
child_nodes = document.getElementById("bodyContent").childNodes;
num_child_nodes = child_nodes.length;
// if (num_child_nodes > 40) { num_child_nodes = 40;} // truncate loop for testing
for (child_node_index=0; child_node_index < num_child_nodes; child_node_index++) {
child_node = child_nodes[child_node_index];
if (child_node.nodeType != 1) {
// alert_string += "Child node " + child_node_index + " : " + child_node.nodeName + "\n";
continue;
} // examine only Element nodes
num_element_nodes++;
// alert_string += "Element node " + num_element_nodes + " : " + child_node.nodeName + "\n";
} // closes loop counting the element nodes
// window.alert(alert_string);
// Determine the corresponding childNode index cutoff
alert_string = "";
if (read_entire_article == true) {
cutoff_child_node_index = num_child_nodes;
cutoff_element_node_index = num_element_nodes;
} else {
H2_anchor_index = 0;
element_node_index = 0;
last_P_child_node_index = -1;
last_P_element_node_index = -1;
for (child_node_index=0; child_node_index < num_child_nodes; child_node_index++) {
child_node = child_nodes[child_node_index];
if (child_node.nodeType != 1) { continue; } // examine only Element nodes
element_node_index++;
if (child_node.nodeName == "P") {
last_P_child_node_index = child_node_index;
last_P_element_node_index = num_element_nodes;
} else if (child_node.nodeName == "H2") {
H2_anchor_index++;
if (H2_anchor_index == cutoff_H2_anchor_index) {
cutoff_child_node_index = last_P_child_node_index;
cutoff_element_node_index = last_P_element_node_index;
break;
}
}
// alert_string += "Section " + H2_anchor_index + ", Element node " + num_element_nodes + " : " + child_node.nodeName + " " + child_node.childNodes.length + "\n";
// if (num_element_nodes > 45) { break; } // for debugging
} // closes loop over the childNodes of the Document
if (last_P_child_node_index < 0) { // if no cutoff was discovered; should never happen
cutoff_child_node_index = num_child_nodes;
cutoff_element_node_index = num_element_nodes;
}
} // closes check whether to read entire article
alert_string = "\nThe child_node_index and element_node_index cutoffs are " + cutoff_child_node_index + " and " + cutoff_element_node_index + ", respectively.\n";
// window.alert(alert_string);
// Count the words, paragraphs and prose size bytes by section
word_count = 0;
paragraph_count = 0;
list_item_count = 0;
prose_size_bytes = 0;
num_prose_counted_nodes = 0;
H2_anchor_index = 0;
for (child_node_index=0; child_node_index < cutoff_child_node_index; child_node_index++) {
child_node = child_nodes[child_node_index];
if (child_node.nodeType != 1) { continue; } // examine only Element nodes
element_node_index++;
if (child_node.nodeName == "H2") {
section_word_count.push(word_count);
section_paragraph_count.push(paragraph_count);
section_list_item_count.push(list_item_count);
section_prose_size_bytes.push(prose_size_bytes);
H2_anchor_index++;
word_count = 0;
paragraph_count = 0;
list_item_count = 0;
prose_size_bytes = 0;
}
// if the child node meets the criteria, add to the prose size, word and paragraph counts
if ((child_node.nodeName == "P") || (child_node.nodeName == "PRE")) {
untagged_text = child_node.innerHTML;
untagged_text = untagged_text.replace(/<sup>/ig,""); // keep simple superscript text
untagged_text = untagged_text.replace(/(<sup([^>]+)>)(.*?<\/sup>)/ig,""); // remove superscript text
untagged_text = untagged_text.replace(/(<([^>]+)>)/ig,""); // remove remaining tags
untagged_text = untagged_text.replace(/>/ig, ">"); // convert > to a single character >
untagged_text = untagged_text.replace(/</ig, "<"); // convert < to a single character <
untagged_text = untagged_text.replace(/&/ig, "&"); // convert & to a single character &
untagged_text = untagged_text.replace(/—/ig, ", "); // replace em-dashes with comma+space
spaced_text = untagged_text.replace(/ /ig, " "); // convert non-breaking spaces to spaces
spaced_text = spaced_text.replace(/ /ig, " "); // convert non-breaking spaces to spaces
spaced_text = spaced_text.replace(/\s+/ig, " "); // convert all whitespace to a single space
// spaced_text = filterStringForProseSizeCounting(untagged_text);
words = spaced_text.split(' ');
tentative_num_words = words.length;
if (tentative_num_words > 0) { // verify that the paragraph contributes text
num_words = 0;
num_characters = 0;
for (word_index=0; word_index<tentative_num_words; word_index++) {
temp_word = words[word_index];
del_num_characters = temp_word.length;
if (del_num_characters > 0) {
num_words++;
num_characters += del_num_characters;
}
}
if (num_words > 0) {
paragraph_count++;
num_prose_counted_nodes++;
word_count += num_words;
prose_size_bytes += num_characters;
num_spaces = num_words - 1;
prose_size_bytes += num_spaces; // add spaces to character count
child_node.style.cssText = "background-color:yellow";
// Code for testing output
if ((!show_section_diagnostics) && ((!show_lead_diagnostics) || (H2_anchor_index != 0))) {
continue;
}
diagnostic_string = "";
nonempty_word_index = 0;
temp_num_characters = 0;
for (word_index=0; word_index<tentative_num_words; word_index++) {
if ((word_index%45 == 1) && (word_index>45) && (display_individual_words)) {
window.alert(diagnostic_string);
diagnostic_string = "Continued from previous screen:\n\n";
}
temp_word = words[word_index];
del_num_characters = temp_word.length;
if (del_num_characters > 0) {
nonempty_word_index++;
temp_num_characters += del_num_characters;
diagnostic_string += "Section " + H2_anchor_index + ", Paragraph " + paragraph_count + ", Word " + nonempty_word_index + " : " + temp_word + " " + del_num_characters + " " + temp_num_characters + "\n";
}
}
temp_num_characters += num_spaces;
diagnostic_string += "Added " + num_spaces + " spaces to the byte count.\n\n";
if (display_individual_words) {
diagnostic_string += "\nEND of paragraph " + paragraph_count + " of Section " + H2_anchor_index + ": character count = " + temp_num_characters + " total= " + prose_size_bytes + "\n";
window.alert(diagnostic_string);
}
} // closes check for non-empty paragraph
} // tentative check for words
} else if ((child_node.nodeName == "UL") || (child_node.nodeName == "OL")) { // unordered and ordered lists
grandchild_nodes = child_node.childNodes; // not all LI elements because of possible nesting
num_grandchild_nodes = grandchild_nodes.length;
for (grandchild_node_index=0; grandchild_node_index<num_grandchild_nodes; grandchild_node_index++) {
grandchild_node = grandchild_nodes[grandchild_node_index];
if (grandchild_node.nodeName == "LI") {
untagged_text = grandchild_node.innerHTML;
untagged_text = untagged_text.replace(/<sup>/ig,""); // keep simple superscript text
untagged_text = untagged_text.replace(/(<sup([^>]+)>)(.*?<\/sup>)/ig,""); // remove superscript text
untagged_text = untagged_text.replace(/(<([^>]+)>)/ig,""); // remove remaining tags
untagged_text = untagged_text.replace(/>/ig, ">"); // convert > to a single character >
untagged_text = untagged_text.replace(/</ig, "<"); // convert < to a single character <
untagged_text = untagged_text.replace(/&/ig, "&"); // convert & to a single character &
untagged_text = untagged_text.replace(/—/ig, ", "); // replace em-dashes with comma+space
spaced_text = untagged_text.replace(/ /ig, " "); // convert non-breaking spaces to spaces
spaced_text = spaced_text.replace(/ /ig, " "); // convert non-breaking spaces to spaces
spaced_text = spaced_text.replace(/\s+/ig, " "); // convert all whitespace to a single space
words = spaced_text.split(' ');
tentative_num_words = words.length;
if (tentative_num_words > 0) { // verify that the list item contributes text
num_words = 0;
num_characters = 0;
for (word_index=0; word_index<tentative_num_words; word_index++) {
temp_word = words[word_index];
del_num_characters = temp_word.length;
if (del_num_characters > 0) {
num_words++;
num_characters += del_num_characters;
}
}
if (num_words > 0) {
list_item_count++;
num_prose_counted_nodes++;
word_count += num_words;
prose_size_bytes += num_characters;
num_spaces = num_words - 1;
prose_size_bytes += num_spaces; // add spaces to character count
child_node.style.cssText = "background-color:yellow";
// Code for testing output
if ((!show_section_diagnostics) && ((!show_lead_diagnostics) || (H2_anchor_index != 0))) {
continue;
}
diagnostic_string = "";
nonempty_word_index = 0;
temp_num_characters = 0;
for (word_index=0; word_index<tentative_num_words; word_index++) {
if ((word_index%45 == 1) && (word_index>45) && (display_individual_words)) {
window.alert(diagnostic_string);
diagnostic_string = "Continued from previous screen:\n\n";
}
temp_word = words[word_index];
del_num_characters = temp_word.length;
if (del_num_characters > 0) {
nonempty_word_index++;
temp_num_characters += del_num_characters;
diagnostic_string += "Section " + H2_anchor_index + ", Paragraph " + paragraph_count + ", List item " + list_item_count + ", Word " + nonempty_word_index + " : " + temp_word + " " + del_num_characters + "\n";
}
}
temp_num_characters += num_spaces;
diagnostic_string += "Added " + num_spaces + " spaces to the byte count.\n\n";
if (display_individual_words) {
diagnostic_string += "\nEND of list item " + list_item_count + " of Section " + H2_anchor_index + ": character count = " + temp_num_characters + " total= " + prose_size_bytes + "\n";
window.alert(diagnostic_string);
}
} // closes check for non-empty list item
} // tentative check for words
} // closes check for a list item (LI) node
} // closes loop over grandchild nodes of an ordered (OL) or unordered (UL) list
} else if (child_node.nodeName == "DL") { // discursive lists
grandchild_nodes = child_node.childNodes;
num_grandchild_nodes = grandchild_nodes.length;
for (grandchild_node_index=0; grandchild_node_index<num_grandchild_nodes; grandchild_node_index++) {
grandchild_node = grandchild_nodes[grandchild_node_index];
if ((grandchild_node.nodeName == "DT") || (grandchild_node.nodeName == "DD")) {
// Exceptions that shouldn't be counted
if (grandchild_node.childNodes.length > 0) {
temp_node_name = grandchild_node.childNodes[0].nodeName;
if ((temp_node_name == "DIV") || (temp_node_name == "SPAN")) { continue; }
}
if (grandchild_node.childNodes.length > 1) {
temp_node_name = grandchild_node.childNodes[1].nodeName;
if ((temp_node_name == "DIV") || (temp_node_name == "SPAN")) { continue; }
}
untagged_text = grandchild_node.innerHTML;
untagged_text = untagged_text.replace(/<sup>/ig,""); // keep simple superscript text
untagged_text = untagged_text.replace(/(<sup([^>]+)>)(.*?<\/sup>)/ig,""); // remove superscript text
untagged_text = untagged_text.replace(/(<([^>]+)>)/ig,""); // remove remaining tags
untagged_text = untagged_text.replace(/>/ig, ">"); // convert > to a single character >
untagged_text = untagged_text.replace(/</ig, "<"); // convert < to a single character <
untagged_text = untagged_text.replace(/&/ig, "&"); // convert & to a single character &
untagged_text = untagged_text.replace(/—/ig, ", "); // replace em-dashes with comma+space
spaced_text = untagged_text.replace(/ /ig, " "); // convert non-breaking spaces to spaces
spaced_text = spaced_text.replace(/ /ig, " "); // convert non-breaking spaces to spaces
spaced_text = spaced_text.replace(/\s+/ig, " "); // convert all whitespace to a single space
words = spaced_text.split(' ');
tentative_num_words = words.length;
if (tentative_num_words > 0) { // verify that the list item contributes text
num_words = 0;
num_characters = 0;
for (word_index=0; word_index<tentative_num_words; word_index++) {
temp_word = words[word_index];
del_num_characters = temp_word.length;
if (del_num_characters > 0) {
num_words++;
num_characters += del_num_characters;
}
}
if (num_words > 0) {
list_item_count++;
num_prose_counted_nodes++;
word_count += num_words;
prose_size_bytes += num_characters;
num_spaces = num_words - 1;
prose_size_bytes += num_spaces; // add spaces to character count
child_node.style.cssText = "background-color:yellow";
// Code for testing output
if ((!show_section_diagnostics) && ((!show_lead_diagnostics) || (H2_anchor_index != 0))) {
continue;
}
diagnostic_string = "";
nonempty_word_index = 0;
temp_num_characters = 0;
for (word_index=0; word_index<tentative_num_words; word_index++) {
if ((word_index%45 == 1) && (word_index>45) && (display_individual_words)) {
window.alert(diagnostic_string);
diagnostic_string = "Continued from previous screen:\n\n";
}
temp_word = words[word_index];
del_num_characters = temp_word.length;
if (del_num_characters > 0) {
nonempty_word_index++;
temp_num_characters += del_num_characters;
diagnostic_string += "Section " + H2_anchor_index + ", Paragraph " + paragraph_count + ", List item " + list_item_count + ", Word " + nonempty_word_index + " : " + temp_word + " " + del_num_characters + "\n"; }
}
temp_num_characters += num_spaces;
diagnostic_string += "Added " + num_spaces + " spaces to the byte count.\n\n";
if (display_individual_words) {
diagnostic_string += "\nEND of discursive list item " + list_item_count + " of Section " + H2_anchor_index + ": character count = " + temp_num_characters + " total= " + prose_size_bytes + "\n";
window.alert(diagnostic_string);
}
} // closes check for non-empty list item
} // tentative check for words
} // closes check for a discursive list item (DT or DD) node
} // closes loop over grandchild nodes of a discursive list DL
} else if (child_node.nodeName == "BLOCKQUOTE") {
grandchild_nodes = child_node.getElementsByTagName("P");
num_grandchild_nodes = grandchild_nodes.length;
for (grandchild_node_index=0; grandchild_node_index<num_grandchild_nodes; grandchild_node_index++) {
grandchild_node = grandchild_nodes[grandchild_node_index];
if (grandchild_node.nodeName == "P") {
untagged_text = grandchild_node.innerHTML;
untagged_text = untagged_text.replace(/<sup>/ig,""); // keep simple superscript text
untagged_text = untagged_text.replace(/(<sup([^>]+)>)(.*?<\/sup>)/ig,""); // remove superscript text
untagged_text = untagged_text.replace(/(<([^>]+)>)/ig,""); // remove remaining tags
untagged_text = untagged_text.replace(/>/ig, ">"); // convert > to a single character >
untagged_text = untagged_text.replace(/</ig, "<"); // convert < to a single character <
untagged_text = untagged_text.replace(/&/ig, "&"); // convert & to a single character &
untagged_text = untagged_text.replace(/—/ig, ", "); // replace em-dashes with comma+space
spaced_text = untagged_text.replace(/ /ig, " "); // convert non-breaking spaces to spaces
spaced_text = spaced_text.replace(/ /ig, " "); // convert non-breaking spaces to spaces
spaced_text = spaced_text.replace(/\s+/ig, " "); // convert all whitespace to a single space
words = spaced_text.split(' ');
tentative_num_words = words.length;
if (tentative_num_words > 0) { // verify that the list item contributes text
num_words = 0;
num_characters = 0;
for (word_index=0; word_index<tentative_num_words; word_index++) {
temp_word = words[word_index];
del_num_characters = temp_word.length;
if (del_num_characters > 0) {
num_words++;
num_characters += del_num_characters;
}
}
if (num_words > 0) {
// don't count blockquotes, for now
num_prose_counted_nodes++;
word_count += num_words;
prose_size_bytes += num_characters;
num_spaces = num_words - 1;
prose_size_bytes += num_spaces; // add spaces to character count
child_node.style.cssText = "background-color:yellow";
// Code for testing output
if ((!show_section_diagnostics) && ((!show_lead_diagnostics) || (H2_anchor_index != 0))) {
continue;
}
diagnostic_string = "";
nonempty_word_index = 0;
temp_num_characters = 0;
for (word_index=0; word_index<tentative_num_words; word_index++) {
if ((word_index%45 == 1) && (word_index>45) && (display_individual_words)) {
window.alert(diagnostic_string);
diagnostic_string = "Continued from previous screen:\n\n";
}
temp_word = words[word_index];
del_num_characters = temp_word.length;
if (del_num_characters > 0) {
nonempty_word_index++;
temp_num_characters += del_num_characters;
diagnostic_string += "Section " + H2_anchor_index + ", Paragraph " + paragraph_count + ", Word " + nonempty_word_index + " : " + temp_word + " " + del_num_characters + "\n";
}
}
temp_num_characters += num_spaces;
diagnostic_string += "Added " + num_spaces + " spaces to the byte count.\n\n";
if (display_individual_words) {
diagnostic_string += "\nEND of BLOCKQUOTE in Section " + H2_anchor_index + ": character count = " + temp_num_characters + " total= " + prose_size_bytes + "\n";
window.alert(diagnostic_string);
}
} // closes check for non-empty list item
} // tentative check for words
} // closes check for a paragraph (P) node in a BLOCKQUOTE
} // closes loop over grandchild nodes in a BLOCKQUOTE
} else if (child_node.nodeName == "TABLE") {
if (child_node.className != "cquote") { continue; } // count only tables that are cquotes
grandchild_nodes = child_node.getElementsByTagName("TD");
num_grandchild_nodes = grandchild_nodes.length;
for (grandchild_node_index=0; grandchild_node_index<num_grandchild_nodes; grandchild_node_index++) {
grandchild_node = grandchild_nodes[grandchild_node_index];
if (grandchild_node.nodeName == "TD") {
untagged_text = grandchild_node.innerHTML;
untagged_text = untagged_text.replace(/<sup>/ig,""); // keep simple superscript text
untagged_text = untagged_text.replace(/(<sup([^>]+)>)(.*?<\/sup>)/ig,""); // remove superscript text
untagged_text = untagged_text.replace(/(<([^>]+)>)/ig,""); // remove remaining tags
untagged_text = untagged_text.replace(/>/ig, ">"); // convert > to a single character >
untagged_text = untagged_text.replace(/</ig, "<"); // convert < to a single character <
untagged_text = untagged_text.replace(/&/ig, "&"); // convert & to a single character &
untagged_text = untagged_text.replace(/—/ig, ", "); // replace em-dashes with comma+space
spaced_text = untagged_text.replace(/ /ig, " "); // convert non-breaking spaces to spaces
spaced_text = spaced_text.replace(/ /ig, " "); // convert non-breaking spaces to spaces
spaced_text = spaced_text.replace(/\s+/ig, " "); // convert all whitespace to a single space
words = spaced_text.split(' ');
tentative_num_words = words.length;
if (tentative_num_words > 0) { // verify that the list item contributes text
num_words = 0;
num_characters = 0;
for (word_index=0; word_index<tentative_num_words; word_index++) {
temp_word = words[word_index];
del_num_characters = temp_word.length;
if (del_num_characters > 0) {
num_words++;
num_characters += del_num_characters;
}
}
if (num_words > 0) {
// don't count cquotes, for now
num_prose_counted_nodes++;
word_count += num_words;
prose_size_bytes += num_characters;
num_spaces = num_words - 1;
prose_size_bytes += num_spaces; // add spaces to character count
child_node.style.cssText = "background-color:yellow";
// Code for testing output
if ((!show_section_diagnostics) && ((!show_lead_diagnostics) || (H2_anchor_index != 0))) {
continue;
}
diagnostic_string = "";
nonempty_word_index = 0;
temp_num_characters = 0;
for (word_index=0; word_index<tentative_num_words; word_index++) {
if ((word_index%45 == 1) && (word_index>45) && (display_individual_words)) {
window.alert(diagnostic_string);
diagnostic_string = "Continued from previous screen:\n\n";
}
temp_word = words[word_index];
del_num_characters = temp_word.length;
if (del_num_characters > 0) {
nonempty_word_index++;
temp_num_characters += del_num_characters;
diagnostic_string += "Section " + H2_anchor_index + ", Paragraph " + paragraph_count + ", Word " + nonempty_word_index + " : " + temp_word + " " + del_num_characters + "\n";
}
}
temp_num_characters += num_spaces;
diagnostic_string += "Added " + num_spaces + " spaces to the byte count.\n\n";
if (display_individual_words) {
diagnostic_string += "\nEND of CQUOTE paragraph in Section " + H2_anchor_index + ": character count = " + temp_num_characters + " total= " + prose_size_bytes + "\n";
window.alert(diagnostic_string);
}
} // closes check for non-empty list item
} // tentative check for words
} // closes check for a paragraph (P) node in a CQUOTE
} // closes loop over grandchild nodes in a CQUOTE
} else if (child_node.nodeName == "DIV") { // Poems
if (child_node.className != "poem") { continue; } // allow only poem DIV's
grandchild_nodes = child_node.getElementsByTagName("P");
num_grandchild_nodes = grandchild_nodes.length;
for (grandchild_node_index=0; grandchild_node_index<num_grandchild_nodes; grandchild_node_index++) {
grandchild_node = grandchild_nodes[grandchild_node_index];
if (grandchild_node.nodeName == "P") {
untagged_text = grandchild_node.innerHTML;
untagged_text = untagged_text.replace(/<sup>/ig,""); // keep simple superscript text
untagged_text = untagged_text.replace(/(<sup([^>]+)>)(.*?<\/sup>)/ig,""); // remove superscript text
untagged_text = untagged_text.replace(/(<([^>]+)>)/ig,""); // remove remaining tags
untagged_text = untagged_text.replace(/>/ig, ">"); // convert > to a single character >
untagged_text = untagged_text.replace(/</ig, "<"); // convert < to a single character <
untagged_text = untagged_text.replace(/&/ig, "&"); // convert & to a single character &
untagged_text = untagged_text.replace(/—/ig, ", "); // replace em-dashes with comma+space
spaced_text = untagged_text.replace(/ /ig, " "); // convert non-breaking spaces to spaces
spaced_text = spaced_text.replace(/ /ig, " "); // convert non-breaking spaces to spaces
spaced_text = spaced_text.replace(/\s+/ig, " "); // convert all whitespace to a single space
words = spaced_text.split(' ');
tentative_num_words = words.length;
if (tentative_num_words > 0) { // verify that the list item contributes text
num_words = 0;
num_characters = 0;
for (word_index=0; word_index<tentative_num_words; word_index++) {
temp_word = words[word_index];
del_num_characters = temp_word.length;
if (del_num_characters > 0) {
num_words++;
num_characters += del_num_characters;
}
}
if (num_words > 0) {
// don't count blockquotes, for now
num_prose_counted_nodes++;
word_count += num_words;
prose_size_bytes += num_characters;
num_spaces = num_words - 1;
prose_size_bytes += num_spaces; // add spaces to character count
child_node.style.cssText = "background-color:yellow";
// Code for testing output
if ((!show_section_diagnostics) && ((!show_lead_diagnostics) || (H2_anchor_index != 0))) {
continue;
}
diagnostic_string = "";
nonempty_word_index = 0;
temp_num_characters = 0;
for (word_index=0; word_index<tentative_num_words; word_index++) {
if ((word_index%45 == 1) && (word_index>45) && (display_individual_words)) {
window.alert(diagnostic_string);
diagnostic_string = "Continued from previous screen:\n\n";
}
temp_word = words[word_index];
del_num_characters = temp_word.length;
if (del_num_characters > 0) {
nonempty_word_index++;
temp_num_characters += del_num_characters;
diagnostic_string += "Section " + H2_anchor_index + ", Paragraph " + paragraph_count + ", Word " + nonempty_word_index + " : " + temp_word + " " + del_num_characters + "\n";
}
}
temp_num_characters += num_spaces;
diagnostic_string += "Added " + num_spaces + " spaces to the byte count.\n\n";
if (display_individual_words) {
diagnostic_string += "\nEND of <poem> in Section " + H2_anchor_index + ": character count = " + temp_num_characters + " total= " + prose_size_bytes + "\n";
window.alert(diagnostic_string);
}
} // closes check for non-empty list item
} // tentative check for words
} // closes check for a paragraph (P) node in a poem
} // closes loop over grandchild nodes in a poem
} // closes check for appropriate elements
} // closes loop over the child nodes
section_word_count.push(word_count);
section_paragraph_count.push(paragraph_count);
section_list_item_count.push(list_item_count);
section_prose_size_bytes.push(prose_size_bytes);
// Output the various counts
word_count_string = " word";
paragraph_count_string = " paragraph";
list_item_count_string = " list item";
prose_size_bytes_string = " byte";
if (section_word_count[0] != 1) { word_count_string += "s";}
if (section_paragraph_count[0] != 1) { paragraph_count_string += "s";}
if (section_list_item_count[0] != 1) { list_item_count_string += "s";}
if (section_prose_size_bytes[0] != 1) { prose_size_bytes_string += "s";}
alert_string = "Lead section: " + section_paragraph_count[0] + paragraph_count_string + ", " + section_list_item_count[0] + list_item_count_string + ", " + section_word_count[0] + word_count_string + ", " + section_prose_size_bytes[0] + prose_size_bytes_string + "\n\n";
total_word_count = section_word_count[0];
total_paragraph_count = section_paragraph_count[0];
total_list_item_count = section_list_item_count[0];
total_prose_size_bytes = section_prose_size_bytes[0];
num_sections = section_word_count.length;
for (section_index=1; section_index<num_sections; section_index++) {
total_word_count += section_word_count[section_index];
total_paragraph_count += section_paragraph_count[section_index];
total_list_item_count += section_list_item_count[section_index];
total_prose_size_bytes += section_prose_size_bytes[section_index];
word_count_string = " word";
paragraph_count_string = " paragraph";
list_item_count_string = " list item";
prose_size_bytes_string = " byte";
if (section_word_count[section_index] != 1) { word_count_string += "s";}
if (section_paragraph_count[section_index] != 1) { paragraph_count_string += "s";}
if (section_list_item_count[section_index] != 1) { list_item_count_string += "s";}
if (section_prose_size_bytes[section_index] != 1) { prose_size_bytes_string += "s";}
alert_string += "Section " + section_index + " : " + section_paragraph_count[section_index] + paragraph_count_string + ", " + section_list_item_count[section_index] + list_item_count_string + ", " + section_word_count[section_index] + word_count_string + ", " + section_prose_size_bytes[section_index] + prose_size_bytes_string + "\n";
}
if (num_sections>1) {alert_string += "\n";} // Make space for the totals
word_count_string = " word";
paragraph_count_string = " paragraph";
list_item_count_string = " list item";
prose_size_bytes_string = " byte";
if (total_word_count != 1) { word_count_string += "s";}
if (total_paragraph_count != 1) { paragraph_count_string += "s";}
if (total_list_item_count != 1) { list_item_count_string += "s";}
if (total_prose_size_bytes != 1) { prose_size_bytes_string += "s";}
alert_string += "Totals: " + total_paragraph_count + paragraph_count_string + ", " + total_list_item_count + list_item_count_string + ", " + total_word_count + word_count_string + ", " + total_prose_size_bytes + prose_size_bytes_string + "\n";
window.alert(alert_string);
// Count the article images
num_nonicon_images = 0;
num_raw_images = document.images.length;
alert_string = "This document has " + num_raw_images + " images.\n";
for (image_index=0; image_index<num_raw_images; image_index++) {
temp_image = document.images[image_index];
parent_node = temp_image.parentNode;
grandparent_node = parent_node.parentNode;
greatgrandparent_node = grandparent_node.parentNode;
num_pixels = temp_image.width * temp_image.height;
if (temp_image.src.match(/Replace_this_image_male\.svg/)) { continue; }
if (temp_image.src.match(/Replace_this_image_female\.svg/)) { continue; }
if (num_pixels > 5000) { num_nonicon_images++; }
}
if (num_nonicon_images == 1) {
alert_string = "This document has 1 image with more than 5000 pixels.\n\n";
} else {
alert_string = "This document has " + num_nonicon_images + " images with more than 5000 pixels.\n\n";
}
image_counter = 0;
for (image_index=0; image_index<num_raw_images; image_index++) {
temp_image = document.images[image_index];
parent_node = temp_image.parentNode;
grandparent_node = parent_node.parentNode;
greatgrandparent_node = grandparent_node.parentNode;
num_pixels = temp_image.width * temp_image.height;
if (temp_image.src.match(/Replace_this_image_male\.svg/)) { continue; }
if (temp_image.src.match(/Replace_this_image_female\.svg/)) { continue; }
if (num_pixels < 5001) { continue; }
image_counter++;
alert_string += image_counter + " " + temp_image.width + "x" + temp_image.height + " " + num_pixels + " ";
path_names = temp_image.src.split("/");
file_name = path_names.pop();
file_name = file_name.replace(/^(\d+)px-/, "");
alert_string += file_name + "\n";
}
window.alert(alert_string);
return;
// Count the article tables and check for infoboxes and navigation templates
num_raw_tables = document.getElementsByTagName("table").length;
// Check for className = "infobox vcard" or "navbox-group"
alert_string = "This document has " + num_raw_tables + " tables.\n";
window.alert(alert_string);
// Count the article references
num_raw_references = document.getElementsByTagName("li").length;
// Count the article interwikis
num_raw_interwikis = document.getElementsByTagName("li").length;
// Count the article categories
num_raw_categories = document.getElementsByTagName("table").length;
// Count the article anchors; for each anchor...
alert_string = "This document has " + document.anchors.length + " anchors:\n";
for (anchor_index=0; anchor_index<document.anchors.length; anchor_index++) {
temp_anchor = document.anchors[anchor_index];
alert_string += "Name " + anchor_index + ": " + temp_anchor.name + "\n";
}
window.alert(alert_string);
} // closes function articleStructure()
// Register the "structure" tab in the page's action tabs (portlet 'p-cactions').
//
// addOnloadHook is a deprecated legacy MediaWiki global, and mw.util is only
// guaranteed to be defined once the 'mediawiki.util' ResourceLoader module has
// loaded. Wait for the module first, then for DOM-ready, before adding the link.
// The link itself is unchanged: it invokes the global articleStructure() via a
// javascript: URL, with access key 'g' and no explicit insertion point.
mw.loader.using('mediawiki.util').then(function () {
  $(function () {
    mw.util.addPortletLink('p-cactions', 'javascript:articleStructure()', 'structure', 'ca-structure', 'Structure of the article', 'g', '');
  });
});
//</pre>