<%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%> <%@ page trimDirectiveWhitespaces="true" %> <%@ page import="org.crosswire.community.projects.ntmss.data.Feature" %> <%@ page import="org.crosswire.community.projects.ntmss.data.Document" %> <%@ page import="org.crosswire.community.projects.ntmss.data.Apparatus.Segment" %> <%@ page import="org.crosswire.community.projects.ntmss.data.Page" %> <%@ page import="org.crosswire.community.projects.ntmss.data.ShelfInstance" %> <%@ page import="org.crosswire.community.projects.ntmss.data.ProjectManagement" %> <%@ page import="org.crosswire.community.projects.ntmss.data.ProjectManagement.Project" %> <%@ page import="org.crosswire.community.projects.ntmss.data.Transcription" %> <%@ page import="org.crosswire.xml.XMLBlock" %> <%@ page import="org.crosswire.sword.keys.ListKey" %> <%@ page import="org.crosswire.sword.keys.VerseKey" %> <%@ page import="java.util.List" %> <%@ page import="java.util.Collections" %> <%@ page import="java.util.Arrays" %> <%@ page import="java.util.List" %> <%@ page import="java.util.ArrayList" %> <%@ page import="org.apache.log4j.Logger" %> <%@ page import="org.crosswire.webtools.annotation.*" %> <%@ page import="org.crosswire.webtools.*" %> <%! @Description(value = "Perform sanity checks on various parts of the dataset", name = "statistics/integritycheck") public static class MyParameters extends Parameters { protected ProjectManagement.Project project = null; @Description(value = "Shows pages with verseCount > saneLimit", defaultValue = "false", example = "true for default saneLimit of 50 or e.g., 100 for custom limit") public Integer verseMax = null; @Description(value = "Shows documents with pages not assigned to a shelf instance", defaultValue = "false", example = "true") public Boolean pageNoShelf = false; @Description(value = "Shows shelf instances with no assigned pages", defaultValue = "false", example = "true") public Boolean shelfNoPage = false; @Description(value = "Shows descrepencies between index data and transcription", defaultValue = "false", example = "true") public Boolean indexToTranscription = false; @Description(value = "Shows apparatus readings with odd witness count", defaultValue = "false", example = "true") public Boolean appReadingWitnessCount = false; @Description(value = "Shows apparatus readings with zero witness count, excluding 'a' readings", defaultValue = "false", example = "true") public Boolean appReadingWitnessZero = false; @Description(value = "Limit results to docID min", example = "10000") public Integer docIDMin = null; @Description(value = "Limit results to docID max", example = "19999") public Integer docIDMax = null; @Description(value = "Limit checks to a Document Group", example = "1") public Integer documentGroupID; @Description(value = "Limit checks to a Project ID", defaultValue = "1", example = "43") public Integer projectID = 1; @Description(value = "Limit checks to a Project Name", example = "ECM Matthew") public String projectName; @Description(value = "Show this usage help", example = "true", defaultValue = "false") public Boolean help = false; // format(html) | xml @Override protected void afterLoad() { if (verseMax == null && "true".equals(request.getParameter("verseMax"))) verseMax = 50; } @Override protected void customValidation() { projectName = Transcription.assureUnicode(projectName); if (projectID != null || projectName != null) { project = projectID != null ? ProjectManagement.getProject(projectID) : ProjectManagement.getProject(projectName); if (project == null) { addError(-7, "Project not found."); return; } } if (project != null && documentGroupID == null) { documentGroupID = project.getDocumentGroupID(); } } } %> <% MyParameters params = new MyParameters().loadFromRequest(request, response, false); if (params.getErrors().size() == 0 && !params.help) { int checkCount = 0; String format = request.getParameter("format"); if (format == null) format = "html"; boolean html = "html".equals(format); StringBuffer results = new StringBuffer(); String baseURL = request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort(); // + request.getContextPath(); if (params.verseMax != null) { ++checkCount; Page query = new Page(); String sql = "SELECT DOCUMENTID, PAGEID, COUNT(*) VERSECOUNT, MIN(USERID) INDEXER from BIBLICALCONTENT WHERE 1=1"; sql += " AND PAGEID < 99999"; if (params.docIDMin != null) sql += " AND DOCUMENTID >= " + params.docIDMin; if (params.docIDMax != null) sql += " AND DOCUMENTID <= " + params.docIDMax; sql += " GROUP BY DOCUMENTID, PAGEID"; sql += " HAVING VERSECOUNT > "+params.verseMax; sql += " ORDER BY VERSECOUNT DESC"; List rows = query.getDataSet(sql); if (html) { results.append("

Verse Max - potential problems: " + rows.size() + "

"); results.append(""); } else { results.append(""); } for (Page p : rows) { if (html) { results.append(""); results.append(""); results.append(""); results.append(""); } else { results.append(""); } } if (html) results.append("
DocIDPageIDVerse CountIndexerFixMeURL
"+p.getDocumentID()+""+p.getPageID()+""+p.getLongValue("VERSECOUNT")+""+p.getStringValue("INDEXER")+"fix me
"); else results.append(""); } if (params.shelfNoPage) { ++checkCount; ShelfInstance query = new ShelfInstance(); List rows = query.getDataSet("SELECT * from SHELFINSTANCE SI join INSTITUTION I on I.INSTITUTIONID=SI.INSTITUTIONID WHERE not exists(select 1 from SHELFINSTANCEPAGES SIP where SIP.SHELFID=SI.SHELFID) ORDER BY I.COUNTRY, I.PLACE, I.NAME"); if (html) { results.append("

Shelf Instances with no pages assigned - potential problems: " + rows.size() + "

"); results.append(""); } else { results.append(""); } for (ShelfInstance si : rows) { if (html) { results.append(""); results.append(""); results.append(""); } else { XMLBlock b = XMLBlock.createXMLBlock(si.toFormattedXML()); b.setAttribute("fixMeURL", baseURL+"/group/vmr-administration/page-and-image/?instID="+si.getInstitutionID()+"&shelfID="+si.getShelfInstanceID()); results.append(b.toString()); } } if (html) results.append("
ShelfIDCountryPlaceInstitution NameShelf NumberFixMeURL
"+si.getShelfInstanceID()+""+si.getCountry()+""+si.getPlace()+""+si.getInstitutionName()+""+si.getShelfNumber()+"fix me
"); else results.append(""); } if (params.pageNoShelf) { ++checkCount; Document query = new Document(); String sql = "SELECT D.*, count(1) TOTALUNASSIGNEDPAGES, min(PAGEID) MINUNASSIGNEDPAGEID from DOCUMENT D join PAGE P on P.DOCUMENTID=D.DOCUMENTID WHERE not exists(select 1 from SHELFINSTANCEPAGES SIP where SIP.DOCUMENTID=P.DOCUMENTID and SIP.PAGEID=P.PAGEID)"; if (params.docIDMin != null) sql += " AND D.DOCUMENTID >= " + params.docIDMin; if (params.docIDMax != null) sql += " AND D.DOCUMENTID <= " + params.docIDMax; List rows = query.getDataSet(sql + " GROUP BY D.DOCUMENTID ORDER BY TOTALUNASSIGNEDPAGES DESC"); if (html) { results.append("

Documents with pages unassigned to shelf instances - potential problems: " + rows.size() + "

"); results.append(""); } else { results.append(""); } for (Document d : rows) { if (html) { results.append(""); results.append(""); results.append(""); } else { results.append(""); } } if (html) results.append("
DocIDUnassigned Page CountFirst Unassigned PageIDFixMeURL
"+d.getDocumentID()+""+d.getLongValue("TOTALUNASSIGNEDPAGES")+""+d.getLongValue("MINUNASSIGNEDPAGEID")+"fix me
"); else results.append(""); } if (params.indexToTranscription) { ++checkCount; Page query = new Page(); String sql = "SELECT D.*, P.*, count(BC.VERSE) VERSECOUNTBC from DOCUMENT D join PAGE P on P.DOCUMENTID=D.DOCUMENTID right join BIBLICALCONTENT BC on BC.DOCUMENTID=P.DOCUMENTID and BC.PAGEID=P.PAGEID WHERE 1=1"; if (params.docIDMin != null) sql += " AND D.DOCUMENTID >= " + params.docIDMin; if (params.docIDMax != null) sql += " AND D.DOCUMENTID <= " + params.docIDMax; sql += " GROUP BY P.DOCUMENTID, P.PAGEID"; List rows = query.getDataSet(sql); List problems = new ArrayList(); for (Page p : rows) { ListKey ic = p.getIndexContent(); List icVerses = new ArrayList(); ListKey tic = p.getIndexContentFromTranscription(false); List ticVerses = new ArrayList(); for (ic.setPosition(ic.TOP); ic.popError() == 0; ic.increment()) { int ich = new VerseKey(ic).getHashNumber(); if (ich % 1000 != 0 || ich % 1000000 == 0) { icVerses.add(new VerseKey(ic).getHashNumber()); } } for (tic.setPosition(ic.TOP); tic.popError() == 0; tic.increment()) { ticVerses.add(new VerseKey(tic).getHashNumber()); } Collections.sort(icVerses); Collections.sort(ticVerses); boolean same = Arrays.equals(icVerses.toArray(new Integer[0]), ticVerses.toArray(new Integer[0])); if ((ticVerses.size() != 0 && icVerses.size() != 0) && !same) problems.add(p); } if (html) { results.append("

Pages with transcription verses which do not match index content; problems: " + problems.size() + "

"); results.append(""); } else { results.append(""); } for (Page d : problems) { if (html) { results.append(""); results.append(""); results.append(""); results.append(""); results.append(""); } else { results.append(""); } } if (html) results.append("
DocIDPageIDindexContenttranscriptionIndexContentCheck BiblesFixMeURL IndexingFixMeURL Transcription
"+d.getDocumentID()+""+d.getPageID()+""+d.getIndexContent().getShortRangeText()+""+d.getIndexContentFromTranscription(false).getShortRangeText()+"Biblesfix me: indexingfix me: transcription
"); else results.append(""); } if (params.appReadingWitnessCount) { int amax = 10; int zzmax = 10; ++checkCount; Segment query = new Segment(); String sql = "select VERSE, CONTEXTDESCRIPTION, (select count(1) from SEGMENTREADING SR LEFT JOIN SEGMENTREADINGWITNESS SRW on SRW.SEGMENTREADINGID=SR.SEGMENTREADINGID where SR.SEGMENTID=S.SEGMENTID and SR.READINGNAME='a' and SRW.PROJECTID="+params.projectID+") ACOUNT, (select count(1) from SEGMENTREADING SR LEFT JOIN SEGMENTREADINGWITNESS SRW on SRW.SEGMENTREADINGID=SR.SEGMENTREADINGID where SR.SEGMENTID=S.SEGMENTID and SR.READINGNAME='zz' and SRW.PROJECTID="+params.projectID+") ZZCOUNT from SEGMENT S left join SEGMENTGROUPSEGMENT SG on SG.SEGMENTID=S.SEGMENTID WHERE SEGMENTGROUPID="+params.projectID+" HAVING ACOUNT <= " + amax + " or ZZCOUNT<= " + zzmax + " ORDER BY VERSE, CONTEXTDESCRIPTION;"; List rows = query.getDataSet(sql); Collections.sort(rows); if (html) { results.append("

Segments with 'a' reading witness count <= " + amax + " or 'zz' witness count <= " + zzmax + " - potential problems (or overlapping segments): " + rows.size() + "

"); results.append(""); } else { results.append(""); } for (Segment d : rows) { if (html) { results.append(""); results.append(""); results.append(""); } else { results.append(""); } } if (html) results.append("
VerseContext Description'a' wit
count
'zz' wit
count
FixMeURL
"+d.getVerseTextShort()+""+d.getContextDescription()+""+d.getLongValue("ACOUNT")+""+d.getLongValue("ZZCOUNT")+"fix me
"); else results.append(""); } if (params.appReadingWitnessZero) { ++checkCount; Segment query = new Segment(); String sql = "select VERSE, CONTEXTDESCRIPTION, READINGNAME, READING, SR.CREATEDATE, SR.USERID, SR.SEGMENTREADINGID, SR.PROJECTID from SEGMENTREADING SR left join SEGMENT S on S.SEGMENTID=SR.SEGMENTID left join SEGMENTGROUPSEGMENT SGS on SGS.SEGMENTID=SR.SEGMENTID where not exists (select 1 from SEGMENTREADINGWITNESS SRW where SRW.SEGMENTID=SR.SEGMENTID and SRW.SEGMENTREADINGID=SR.SEGMENTREADINGID) and VERSE between 2001000000 and 2002000000 and SEGMENTGROUPID=" + params.projectID + " and READINGNAME <> 'a'"; List rows = query.getDataSet(sql); Collections.sort(rows); if (html) { results.append("

Segments with non-'a' reading witness count = 0; count: " + rows.size() + "

"); results.append(""); } else { results.append(""); } for (Segment d : rows) { if (html) { results.append(""); results.append(""); results.append(""); results.append(""); results.append(""); } else { results.append(""); } } if (html) { results.append("
VerseContext DescriptionReading
Label
Reading
Text
Create
Date
UserProjectIDFixMeURLShowMeURLDeleteMeURL
"+d.getVerseTextShort()+""+d.getContextDescription()+""+d.getStringValue("READINGNAME")+""+d.getStringValue("READING")+""+d.getStringValue("CREATEDATE")+""+d.getStringValue("USERID")+""+d.getStringValue("PROJECTID")+"fix meshow medelete me
"); results.append(""); } else results.append(""); } if (checkCount > 0) { if (html) { %> <%=results%> <% } else { response.setContentType("text/xml"); %> <%=results%> <% } return; } } params.format = "html"; Serializer.reportErrors(request, response, out, params, true); %>