![]() |
wxIScan
|
00001 /*************************************************************** 00002 * Name: wxiscanhocr2pdf.cpp 00003 * Purpose: ... 00004 * Author: Daniel Nell (daniel.nell@nellresearch.de) 00005 * Created: 2012-02-03 00006 * Copyright: Daniel Nell (www.nellresearch.de) 00007 * License: GPL 00008 **************************************************************/ 00009 00010 // Include precompiled headers. 00011 #include "wx_pch.h" 00012 00013 00014 #if __WXPDFDOC__ 00015 // wxWidgets headers 00016 # include <wx/pdfdoc.h> 00017 # include <wx/tokenzr.h> 00018 # include <wx/xml/xml.h> 00019 00020 // Private headers 00021 # include "wxiscanhocr2pdf.h" 00022 00023 00024 00025 ////////////////////////////////////////////////////////// 00026 // Class wxIScanFrameHocr2Pdf 00027 // 00028 // Standard constructor. 00029 // 00030 wxIScanHocr2Pdf::wxIScanHocr2Pdf( wxXmlDocument *poXmlDoc, wxPdfDocument *poPdfDoc, 00031 int nResolution, const wxString &strHocrClassFilter ) 00032 : m_strHocrClassFilter( strHocrClassFilter ), 00033 m_poXmlDoc( poXmlDoc ), 00034 m_poPdfDoc( poPdfDoc ), 00035 m_nResolution( nResolution ) 00036 { 00037 // Initialization. (Nothing to do, yet.) 00038 } 00039 00040 // Traverse the DOM tree by calling TraverseXmlNodes() 00041 // and flushing outstanding operations in the end. 00042 // 00043 void wxIScanHocr2Pdf::Run() 00044 { 00045 TraverseXmlNodes( m_poXmlDoc->GetRoot() ); 00046 Flush2Pdf(); 00047 } 00048 00049 // Traverse recursively through the DOM tree beginning with the given node. 00050 // 00051 void wxIScanHocr2Pdf::TraverseXmlNodes( wxXmlNode *poNode ) 00052 { 00053 // If this is an XML tag containing a 'title' attribute 00054 // beginning with 'bbox' extract the bounding box and 00055 // the content (the text) and print it on the PDF page 00056 // using the (lower left corner of the) bounding box. 00057 if( poNode->GetType() == wxXML_ELEMENT_NODE ) 00058 { 00059 wxString strAttrClass= poNode->GetAttribute( wxT( "class" ), wxEmptyString ); 00060 wxString strAttrTitle= poNode->GetAttribute( wxT( "title" ), wxEmptyString ); 00061 00062 if( ( strAttrClass.IsEmpty() || !strAttrClass.Cmp( m_strHocrClassFilter ) ) 00063 && strAttrTitle.StartsWith( wxT( "bbox" ) ) ) 00064 { 00065 // Parse string, ... 00066 wxArrayString astrTokens= wxStringTokenize( strAttrTitle ); 00067 00068 // ... get the coordinates of the bounding box, and ... 00069 long x, y; 00070 00071 astrTokens[1].ToLong( &x ); 00072 astrTokens[4].ToLong( &y ); 00073 00074 // ... "print" the text on the PDF page. 00075 Print2Pdf( (double)x / (double)m_nResolution * 25.4, 00076 (double)y / (double)m_nResolution * 25.4, 00077 GetNodeContent( poNode ) ); 00078 } 00079 } 00080 else if( poNode->IsWhitespaceOnly() ) 00081 { 00082 // Flush eventually delayed output. 00083 Flush2Pdf(); 00084 } 00085 00086 // Do the same for all children of this XML node 00087 // (so doing a depth first search). 00088 for( wxXmlNode *poChildNode= poNode->GetChildren(); poChildNode; poChildNode= poChildNode->GetNext() ) 00089 { 00090 TraverseXmlNodes( poChildNode ); 00091 } 00092 } 00093 00094 // Get all text content from all levels below this node. 00095 // 00096 wxString wxIScanHocr2Pdf::GetNodeContent( wxXmlNode *poNode ) 00097 { 00098 // Get the current node's text content... 00099 wxString strContent= poNode->GetNodeContent(); 00100 00101 // ... and concatenate with the child node's text content. 00102 for( wxXmlNode *poIteratorNode= poNode->GetChildren(); poIteratorNode; poIteratorNode= poIteratorNode->GetNext() ) 00103 { 00104 strContent += GetNodeContent( poIteratorNode ); 00105 } 00106 return strContent; 00107 } 00108 00109 // Print the given text at the given coordinates. 00110 // 00111 void wxIScanHocr2Pdf::Print2Pdf( double x, double y, const wxString& strText ) 00112 { 00113 m_poPdfDoc->Text( x, y, strText ); 00114 } 00115 00116 00117 ////////////////////////////////////////////////////////// 00118 // Class wxIScanSmartHocr2Pdf 00119 // 00120 // Standard constructor. 00121 // 00122 wxIScanSmartHocr2Pdf::wxIScanSmartHocr2Pdf( wxXmlDocument *poXmlDoc, wxPdfDocument *poPdfDoc, 00123 int nResolution, const wxString &strHocrClassFilter ) 00124 : wxIScanHocr2Pdf( poXmlDoc, poPdfDoc, nResolution, strHocrClassFilter ) 00125 { 00126 // Clear the collection buffer. 00127 m_strText.Empty(); 00128 00129 // Reset ... 00130 m_y= -1.0; 00131 } 00132 00133 // "Print" the given text at the given coordinates. 00134 // 00135 // Note: This function only collects (delays) text until a 00136 // flush is made using Flush2Pdf(). 00137 // 00138 void wxIScanSmartHocr2Pdf::Print2Pdf( double x, double y, const wxString& strText ) 00139 { 00140 // ... 00141 if( m_y < 0.0 ) 00142 { 00143 m_x= x; 00144 m_y= y; 00145 } 00146 00147 // Add the new text to the already collected text. 00148 m_strText += strText; 00149 } 00150 00151 // Flush outstanding print-to-PDF-commands. 00152 // 00153 // Note: This function flushes the text collected (delayed) 00154 // by Print2Pdf(). 00155 // 00156 void wxIScanSmartHocr2Pdf::Flush2Pdf() 00157 { 00158 if( m_y >= 0.0 ) 00159 { 00160 // "Print" the collected text. 00161 m_poPdfDoc->Text( m_x, m_y, m_strText ); 00162 00163 // Empty the collection buffer. 00164 m_strText.Empty(); 00165 00166 // Reset... 00167 m_y= -1.0; 00168 } 00169 } 00170 #endif // __WXPDFDOC__