wxIScan
wxiscanhocr2pdf.cpp
Go to the documentation of this file.
00001 /***************************************************************
00002  * Name:      wxiscanhocr2pdf.cpp
00003  * Purpose:   ...
00004  * Author:    Daniel Nell (daniel.nell@nellresearch.de)
00005  * Created:   2012-02-03
00006  * Copyright: Daniel Nell (www.nellresearch.de)
00007  * License:   GPL
00008  **************************************************************/
00009 
00010 // Include precompiled headers.
00011 #include "wx_pch.h"
00012 
00013 
00014 #if __WXPDFDOC__
00015 // wxWidgets headers
00016 #   include <wx/pdfdoc.h>
00017 #   include <wx/tokenzr.h>
00018 #   include <wx/xml/xml.h>
00019 
00020 // Private headers
00021 #   include "wxiscanhocr2pdf.h"
00022 
00023 
00024 
00025 //////////////////////////////////////////////////////////
00026 // Class wxIScanHocr2Pdf
00027 //
00028 // Standard constructor.
00029 //
00030 wxIScanHocr2Pdf::wxIScanHocr2Pdf( wxXmlDocument *poXmlDoc, wxPdfDocument *poPdfDoc,
00031                                  int nResolution, const wxString &strHocrClassFilter )
00032  : m_strHocrClassFilter( strHocrClassFilter ),
00033    m_poXmlDoc( poXmlDoc ),
00034    m_poPdfDoc( poPdfDoc ),
00035    m_nResolution( nResolution )
00036 {
00037     // Initialization. (Nothing to do, yet.)
00038 }
00039 
00040 // Traverse the DOM tree by calling TraverseXmlNodes()
00041 // and flushing outstanding operations in the end.
00042 //
00043 void wxIScanHocr2Pdf::Run()
00044 {
00045     TraverseXmlNodes( m_poXmlDoc->GetRoot() );
00046     Flush2Pdf();
00047 }
00048 
00049 // Traverse recursively through the DOM tree beginning with the given node.
00050 //
00051 void wxIScanHocr2Pdf::TraverseXmlNodes( wxXmlNode *poNode )
00052 {
00053     // If this is an XML tag containing a 'title' attribute
00054     // beginning with 'bbox' extract the bounding box and
00055     // the content (the text) and print it on the PDF page
00056     // using the (lower left corner of the) bounding box.
00057     if( poNode->GetType() == wxXML_ELEMENT_NODE )
00058     {
00059         wxString strAttrClass= poNode->GetAttribute( wxT( "class" ), wxEmptyString );
00060         wxString strAttrTitle= poNode->GetAttribute( wxT( "title" ), wxEmptyString );
00061 
00062         if(    ( strAttrClass.IsEmpty() || !strAttrClass.Cmp( m_strHocrClassFilter ) )
00063             && strAttrTitle.StartsWith( wxT( "bbox" ) ) )
00064         {
00065             // Parse string, ...
00066             wxArrayString astrTokens= wxStringTokenize( strAttrTitle );
00067 
00068             // ... get the coordinates of the bounding box, and ...
00069             long x, y;
00070 
00071             astrTokens[1].ToLong( &x );
00072             astrTokens[4].ToLong( &y );
00073 
00074             // ... "print" the text on the PDF page.
00075             Print2Pdf( (double)x / (double)m_nResolution * 25.4,
00076                        (double)y / (double)m_nResolution * 25.4,
00077                        GetNodeContent( poNode ) );
00078         }
00079     }
00080     else if( poNode->IsWhitespaceOnly() )
00081     {
00082         // Flush eventually delayed output.
00083         Flush2Pdf();
00084     }
00085 
00086     // Do the same for all children of this XML node
00087     // (so doing a depth first search).
00088     for( wxXmlNode *poChildNode= poNode->GetChildren(); poChildNode; poChildNode= poChildNode->GetNext() )
00089     {
00090         TraverseXmlNodes( poChildNode );
00091     }
00092 }
00093 
00094 // Get all text content from all levels below this node.
00095 //
00096 wxString wxIScanHocr2Pdf::GetNodeContent( wxXmlNode *poNode )
00097 {
00098     // Get the current node's text content...
00099     wxString strContent= poNode->GetNodeContent();
00100 
00101     // ... and concatenate with the child node's text content.
00102     for( wxXmlNode *poIteratorNode= poNode->GetChildren(); poIteratorNode; poIteratorNode= poIteratorNode->GetNext() )
00103     {
00104         strContent += GetNodeContent( poIteratorNode );
00105     }
00106     return strContent;
00107 }
00108 
00109 // Print the given text at the given coordinates.
00110 //
00111 void wxIScanHocr2Pdf::Print2Pdf( double x, double y, const wxString& strText )
00112 {
00113     m_poPdfDoc->Text( x, y, strText );
00114 }
00115 
00116 
00117 //////////////////////////////////////////////////////////
00118 // Class wxIScanSmartHocr2Pdf
00119 //
00120 // Standard constructor.
00121 //
00122 wxIScanSmartHocr2Pdf::wxIScanSmartHocr2Pdf( wxXmlDocument *poXmlDoc, wxPdfDocument *poPdfDoc,
00123                                             int nResolution, const wxString &strHocrClassFilter )
00124   : wxIScanHocr2Pdf( poXmlDoc, poPdfDoc, nResolution, strHocrClassFilter )
00125 {
00126     // Clear the collection buffer.
00127     m_strText.Empty();
00128 
00129     // Reset ...
00130     m_y= -1.0;
00131 }
00132 
00133 // "Print" the given text at the given coordinates.
00134 //
00135 // Note: This function only collects (delays) text until a
00136 //       flush is made using Flush2Pdf().
00137 //
00138 void wxIScanSmartHocr2Pdf::Print2Pdf( double x, double y, const wxString& strText )
00139 {
00140     // ...
00141     if( m_y < 0.0 )
00142     {
00143         m_x= x;
00144         m_y= y;
00145     }
00146 
00147     // Add the new text to the already collected text.
00148     m_strText += strText;
00149 }
00150 
00151 // Flush outstanding print-to-PDF-commands.
00152 //
00153 // Note: This function flushes the text collected (delayed)
00154 //       by Print2Pdf().
00155 //
00156 void wxIScanSmartHocr2Pdf::Flush2Pdf()
00157 {
00158     if( m_y >= 0.0 )
00159     {
00160         // "Print" the collected text.
00161         m_poPdfDoc->Text( m_x, m_y, m_strText );
00162 
00163         // Empty the collection buffer.
00164         m_strText.Empty();
00165 
00166         // Reset...
00167         m_y= -1.0;
00168     }
00169 }
00170 #endif // __WXPDFDOC__