<%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%> <%@ page trimDirectiveWhitespaces="true" %> <%@ page import="org.crosswire.community.projects.ntmss.data.Feature" %> <%@ page import="org.crosswire.community.projects.ntmss.data.Document" %> <%@ page import="org.crosswire.community.projects.ntmss.data.Apparatus.Segment" %> <%@ page import="org.crosswire.community.projects.ntmss.data.Page" %> <%@ page import="org.crosswire.community.projects.ntmss.data.ShelfInstance" %> <%@ page import="org.crosswire.community.projects.ntmss.data.ProjectManagement" %> <%@ page import="org.crosswire.community.projects.ntmss.data.ProjectManagement.Project" %> <%@ page import="org.crosswire.community.projects.ntmss.data.Transcription" %> <%@ page import="org.crosswire.xml.XMLBlock" %> <%@ page import="org.crosswire.sword.keys.ListKey" %> <%@ page import="org.crosswire.sword.keys.VerseKey" %> <%@ page import="java.util.Set" %> <%@ page import="java.util.HashSet" %> <%@ page import="java.util.HashMap" %> <%@ page import="java.util.Map" %> <%@ page import="java.util.Collections" %> <%@ page import="java.util.Arrays" %> <%@ page import="java.util.List" %> <%@ page import="java.util.ArrayList" %> <%@ page import="org.apache.log4j.Logger" %> <%@ page import="org.crosswire.webtools.annotation.*" %> <%@ page import="org.crosswire.webtools.*" %> <%! @Description(value = "Perform sanity check of indexing to transcription", name = "statistics/indexing/comparetranscription") public static class MyParameters extends Parameters { protected ProjectManagement.Project project = null; protected ListKey verses = null; @Description(value = "Limit check to verse range", defaultValue = "", example = "Exod; Lev.1-5") public String indexContent = null; @Description(value = "Set versification system for indexContent", defaultValue = "", example = "LXXNU") public String indexContentV11n = null; @Description(value = "Limit results to docID min", example = "10000") public Integer docIDMin = null; @Description(value = "Limit results to docID max", example = "19999") public Integer docIDMax = null; @Description(value = "Limit checks to a Document Group", example = "1") public Integer documentGroupID; @Description(value = "Limit checks to a Project ID", defaultValue = "1", example = "43") public Integer projectID = 1; @Description(value = "Limit checks to a Project Name", example = "ECM Matthew") public String projectName; @Description(value = "Show this usage help", example = "true", defaultValue = "false") public Boolean help = false; @Override protected void afterLoad() { } @Override protected void customValidation() { projectName = Transcription.assureUnicode(projectName); if (request.getParameter("projectID") != null || projectName != null) { project = projectID != null ? ProjectManagement.getProject(projectID) : ProjectManagement.getProject(projectName); if (project == null) { addError(-7, "Project not found."); return; } } if (project != null && documentGroupID == null) { documentGroupID = project.getDocumentGroupID(); } if (this.indexContent != null) { VerseKey vk = new VerseKey(); vk.setIntros(true); if (this.indexContentV11n != null) vk.setVersificationSystem(this.indexContentV11n); verses = vk.ParseVerseList(this.indexContent, "Mat.1.1", true); } } } %> <% MyParameters params = new MyParameters().loadFromRequest(request, response, false); if (params.getErrors().size() == 0 && !params.help) { int checkCount = 0; String format = request.getParameter("format"); if (format == null) format = "html"; boolean html = "html".equals(format); StringBuffer results = new StringBuffer(); String baseURL = request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort(); // + request.getContextPath(); ++checkCount; Page query = new Page(); String sql = "SELECT D.*, P.*, count(BC.VERSE) VERSECOUNTBC from DOCUMENT D join PAGE P on P.DOCUMENTID=D.DOCUMENTID right join BIBLICALCONTENT BC on BC.DOCUMENTID=P.DOCUMENTID and BC.PAGEID=P.PAGEID WHERE 1=1"; if (params.docIDMin != null) sql += " AND D.DOCUMENTID >= " + params.docIDMin; if (params.docIDMax != null) sql += " AND D.DOCUMENTID <= " + params.docIDMax; if (params.verses != null && params.verses.getElement() != null) { sql += " AND (1=0"; for (int i = 0; i < params.verses.getCount(); ++i) { VerseKey verseKey = (VerseKey) params.verses.getElement(i); sql += " OR (BC.VERSE >= " + verseKey.getLowerBound().getHashNumber() + " AND BC.VERSE <= " + verseKey.getUpperBound().getHashNumber() + ")"; } sql += " )"; } sql += " GROUP BY P.DOCUMENTID, P.PAGEID"; params.getLogger().info("integrityCheck SQL: " + sql); List rows = query.getDataSet(sql); List problems = new ArrayList(); Map differences = new HashMap(); for (Page p : rows) { ListKey ic = p.getIndexContent(); List icVerses = new ArrayList(); ListKey tic = p.getIndexContentFromTranscription(false); List ticVerses = new ArrayList(); for (ic.setPosition(ic.TOP); ic.popError() == 0; ic.increment()) { int ich = new VerseKey(ic).getHashNumber(); if (ich % 1000 != 0 || ich % 1000000 == 0) { icVerses.add(new VerseKey(ic).getHashNumber()); } } for (tic.setPosition(ic.TOP); tic.popError() == 0; tic.increment()) { ticVerses.add(new VerseKey(tic).getHashNumber()); } Collections.sort(icVerses); Collections.sort(ticVerses); boolean same = Arrays.equals(icVerses.toArray(new Integer[0]), ticVerses.toArray(new Integer[0])); if ((ticVerses.size() != 0 && icVerses.size() != 0) && !same) { problems.add(p); Set icVersesExtras = new HashSet(icVerses); icVersesExtras.removeAll(ticVerses); Set ticVersesExtras = new HashSet(ticVerses); ticVersesExtras.removeAll(icVerses); icVersesExtras.addAll(ticVersesExtras); differences.put(p, Page.getBiblicalContent(new ArrayList(icVersesExtras), params.indexContentV11n)); } } if (html) { results.append("

Pages with transcription verses which do not match index content; problems: " + problems.size() + "

"); results.append(""); results.append(""); } else { results.append(""); } for (Page d : problems) { if (html) { results.append(""); results.append(""); results.append(""); results.append(""); results.append(""); results.append(""); } else { results.append(""); } } if (html) results.append("
DocIDPageIDindexContenttranscriptionIndexContentDifferencesCheck BiblesFixMeURL IndexingFixMeURL Transcription
"+d.getDocumentID()+""+d.getPageID()+""+d.getIndexContent().getShortRangeText()+""+d.getIndexContentFromTranscription(false).getShortRangeText()+"" + differences.get(d) + "Biblesfix me: indexingfix me: transcription
"); else results.append(""); if (checkCount > 0) { if (html) { %> <%=results%> <% } else { response.setContentType("text/xml"); %> <%=results%> <% } return; } } params.format = "html"; Serializer.reportErrors(request, response, out, params, true); %>