wxIScan
|
/***************************************************************
 * Name:      wxiscanhocr2pdf.h
 * Purpose:   ...
 * Author:    Daniel Nell (daniel.nell@nellresearch.de)
 * Created:   2012-02-03
 * Copyright: Daniel Nell (www.nellresearch.de)
 * License:   GPL
 **************************************************************/

#ifndef WXISCANHOCR2PDF_H
#define WXISCANHOCR2PDF_H


//////////////////////////////////////////////////////////
// Predefined classes
//
#if __WXPDFDOC__
class wxPdfDocument;
class wxXmlDocument;
class wxXmlNode;


//////////////////////////////////////////////////////////
// Class wxIScanHocr2Pdf
//
/// \brief DOM tree traverser class for
///        wxIScanFrame::AddPdfPage().
///
/// This is a helper class that traverses a hOCR DOM tree
/// and "prints" the text of a hOCR XML file hidden behind
/// the corresponding position on the image.
///
/// NOTE:
///
/// 1) This is some sort of a "private" class to wxIScanFrame
///    and should not be used outside wxIScanFrame::AddPdfPage().
///
/// 2) There is no validity check done on poXmlDoc, poPdfDoc
///    and nResolution. That is, it is assumed that all
///    parameters of the constructor are valid.
class wxIScanHocr2Pdf
{
  public:
    /// \brief Standard constructor.
    ///
    /// \param poXmlDoc             the (valid!) pointer to the XML DOM tree
    /// \param poPdfDoc             the (valid!) pointer to the PDF document
    /// \param nResolution          the (virtual) resolution of the image
    /// \param strHocrClassFilter   the class to use for hOCR information (e. g. whole lines or words)
    wxIScanHocr2Pdf( wxXmlDocument *poXmlDoc,
                     wxPdfDocument *poPdfDoc,
                     int nResolution,
                     const wxString &strHocrClassFilter= wxT( "ocr_line" ) );

    /// \brief Virtual destructor.
    ///
    /// NOTE:
    /// The destructor has to be virtual because this class is a
    /// polymorphic base class (see wxIScanSmartHocr2Pdf); otherwise
    /// destroying a derived object through a pointer to this class
    /// would be undefined behaviour.
    virtual ~wxIScanHocr2Pdf(){}

    /// \brief Traverse the DOM tree by calling TraverseXmlNodes()
    ///        and flushing outstanding operations in the end.
    virtual void Run();

  protected:

    /// \brief Traverse recursively through the DOM tree
    ///        beginning with the given node.
    ///
    /// \param poNode   pointer to start node in the DOM tree.
    virtual void TraverseXmlNodes( wxXmlNode *poNode );

    /// \brief Get all text content from all levels below this node.
    ///
    /// \param poNode   pointer to current node in the DOM tree.
    virtual wxString GetNodeContent( wxXmlNode *poNode );

    /// \brief Print the given text at the given coordinates.
    ///
    /// \param x        abscissa of the origin
    /// \param y        ordinate of the origin
    /// \param strText  text to print
    ///
    /// NOTE:
    /// If you want to change the behaviour of the text placement
    /// you should override this function.
    virtual void Print2Pdf( double x, double y, const wxString& strText );

    /// \brief Flush outstanding print-to-PDF-commands.
    ///
    /// NOTE:
    /// This function does nothing, but can be overridden.
    virtual void Flush2Pdf(){}

  public:
    wxString m_strHocrClassFilter;  ///< Filter for the hOCR 'class' attribute.

  protected:
    wxXmlDocument *m_poXmlDoc;      ///< The pointer to the XML document.
    wxPdfDocument *m_poPdfDoc;      ///< The pointer to the PDF document.
    int m_nResolution;              ///< The (fictive) resolution of an image in dpi.
};


//////////////////////////////////////////////////////////
// Class wxIScanSmartHocr2Pdf
//
/// \brief DOM tree traverser class for wxIScanFrame::AddPdfPage()
///        based on the wxIScanHocr2Pdf class to achieve a
///        "smarter" word based PDF output hidden behind the image.
///
/// This class extends wxIScanHocr2Pdf by "smart" gluing
/// the letters of one word together. This makes it easier
/// to search a PDF document for words (instead of single
/// letters).
///
/// NOTE:
///
/// 1) This is some sort of a "private" class to wxIScanFrame
///    and should not be used outside wxIScanFrame::AddPdfPage().
///
/// 2) There is no validity check done on poXmlDoc, poPdfDoc
///    and nResolution. That is, it is assumed that all
///    parameters of the constructor are valid.
class wxIScanSmartHocr2Pdf: public wxIScanHocr2Pdf
{
  public:
    /// \brief Standard constructor.
    ///
    /// \param poXmlDoc             the (valid!) pointer to the XML DOM tree
    /// \param poPdfDoc             the (valid!) pointer to the PDF document
    /// \param nResolution          the (virtual) resolution of the image
    /// \param strHocrClassFilter   the class to use for hOCR information (e. g. whole lines or words)
    wxIScanSmartHocr2Pdf( wxXmlDocument *poXmlDoc,
                          wxPdfDocument *poPdfDoc,
                          int nResolution,
                          const wxString &strHocrClassFilter= wxT( "ocr_line" ) );

  protected:
    /// \brief Print the given text at the given coordinates.
    ///
    /// \param x        abscissa of the origin
    /// \param y        ordinate of the origin
    /// \param strText  text to print
    ///
    /// NOTE:
    /// This function only collects text until Flush2Pdf()
    /// is called.
    virtual void Print2Pdf( double x, double y, const wxString& strText );

    /// \brief Flush outstanding print-to-PDF-commands.
    ///
    /// NOTE:
    /// This function does the real printing initialized by
    /// (maybe multiple) calls to Print2Pdf().
    virtual void Flush2Pdf();

  protected:
    double m_x, m_y;                ///< Save printing coordinates.
    wxString m_strText;             ///< Save outstanding text.
};
#endif // __WXPDFDOC__

#endif // WXISCANHOCR2PDF_H