wxIScan
wxiscanhocr2pdf.h
Go to the documentation of this file.
00001 /***************************************************************
00002  * Name:      wxiscanhocr2pdf.h
00003  * Purpose:   ...
00004  * Author:    Daniel Nell (daniel.nell@nellresearch.de)
00005  * Created:   2012-02-03
00006  * Copyright: Daniel Nell (www.nellresearch.de)
00007  * License:   GPL
00008  **************************************************************/
00009 
00010 #ifndef WXISCANHOCR2PDF_H
00011 #define WXISCANHOCR2PDF_H
00012 
00013 
00014 //////////////////////////////////////////////////////////
00015 // Forward declarations
00016 //
00017 #if __WXPDFDOC__
00018 class wxPdfDocument;
00019 class wxXmlDocument;
00020 class wxXmlNode;
00021 
00022 
00023 //////////////////////////////////////////////////////////
00024 // Class wxIScanHocr2Pdf
00025 //
00026 /// \brief DOM tree traverser class for
00027 ///        wxIScanFrame::AddPdfPage().
00028 ///
00029 /// This is a helper class that traverses a hOCR DOM tree
00030 /// and "prints" the text of a hOCR XML file hidden behind
00031 /// the corresponding position on the image.
00032 ///
00033 /// NOTE:
00034 ///
00035 /// 1) This is some sort of a "private" class to wxIScanFrame
00036 ///    and should not be used outside wxIScanFrame::AddPdfPage().
00037 ///
00038 /// 2) There is no validity check done on poXmlDoc, poPdfDoc
00039 ///    and nResolution. That is, it is assumed that all
00040 ///    parameters of the constructor are valid.
00041 class wxIScanHocr2Pdf
00042 {
00043   public:
00044     /// \brief Standard constructor.
00045     ///
00046     /// \param poXmlDoc             the (valid!) pointer to the XML DOM tree
00047     /// \param poPdfDoc             the (valid!) pointer to the PDF document
00048     /// \param nResolution          the (virtual) resolution of the image
00049     /// \param strHocrClassFilter   the class to use for hOCR information (e. g. whole lines or words)
00050     wxIScanHocr2Pdf( wxXmlDocument *poXmlDoc,
00051                      wxPdfDocument *poPdfDoc,
00052                      int nResolution,
00053                      const wxString &strHocrClassFilter= wxT( "ocr_line" ) );
00054 
00055     /// \brief Virtual destructor; makes deleting a derived object through a base class pointer safe.
00056     virtual ~wxIScanHocr2Pdf(){}
00057 
00058     /// \brief Traverse the DOM tree by calling TraverseXmlNodes()
00059     ///        and flushing outstanding operations in the end.
00060     virtual void Run();
00061 
00062   protected:
00063 
00064     /// \brief Traverse recursively through the DOM tree
00065     ///        beginning with the given node.
00066     ///
00067     /// \param poNode      pointer to start node in the DOM tree.
00068     virtual void TraverseXmlNodes( wxXmlNode *poNode );
00069 
00070     /// \brief Get all text content from all levels below this node.
00071     ///
00072     /// \param poNode      pointer to current node in the DOM tree.
00073     virtual wxString GetNodeContent( wxXmlNode *poNode );
00074 
00075     /// \brief Print the given text at the given coordinates.
00076     ///
00077     /// \param x            abscissa of the origin
00078     /// \param y            ordinate of the origin
00079     /// \param strText      text to print
00080     ///
00081     /// NOTE:
00082     ///        If you want to change the behaviour of the text placement
00083     ///        you should override this function.
00084     virtual void Print2Pdf( double x, double y, const wxString& strText );
00085 
00086     /// \brief Flush outstanding print-to-PDF-commands.
00087     ///
00088     /// NOTE:
00089     ///        This function does nothing, but can be overridden.
00090     virtual void Flush2Pdf(){}
00091 
00092   public:
00093     wxString m_strHocrClassFilter;      ///< Filter for the hOCR 'class' attribute.
00094 
00095   protected:
00096     wxXmlDocument *m_poXmlDoc;          ///< The pointer to the XML document.
00097     wxPdfDocument *m_poPdfDoc;          ///< The pointer to the PDF document.
00098     int m_nResolution;                  ///< The (fictive) resolution of an image in dpi.
00099 };
00100 
00101 
00102 //////////////////////////////////////////////////////////
00103 // Class wxIScanSmartHocr2Pdf
00104 //
00105 /// \brief DOM tree traverser class for wxIScanFrame::AddPdfPage()
00106 ///        with "smarter" word based PDF output hidden behind the image.
00107 ///
00108 /// This class extends wxIScanHocr2Pdf by "smartly" gluing
00109 /// the letters of one word together. This makes it easier
00110 /// to search a PDF document for words (instead of single
00111 /// letters).
00112 ///
00113 /// NOTE:
00114 ///
00115 /// 1) This is some sort of a "private" class to wxIScanFrame
00116 ///    and should not be used outside wxIScanFrame::AddPdfPage().
00117 ///
00118 /// 2) There is no validity check done on poXmlDoc, poPdfDoc
00119 ///    and nResolution. That is, it is assumed that all
00120 ///    parameters of the constructor are valid.
00121 class wxIScanSmartHocr2Pdf: public wxIScanHocr2Pdf
00122 {
00123   public:
00124     /// \brief Standard constructor.
00125     ///
00126     /// Takes the same parameters as the wxIScanHocr2Pdf constructor.
00127     ///
00128     /// \param poXmlDoc             the (valid!) pointer to the XML DOM tree
00129     /// \param poPdfDoc             the (valid!) pointer to the PDF document
00130     /// \param nResolution          the (virtual) resolution of the image
00131     /// \param strHocrClassFilter   the class to use for hOCR information (e. g. whole lines or words)
00132     wxIScanSmartHocr2Pdf( wxXmlDocument *poXmlDoc,
00133                           wxPdfDocument *poPdfDoc,
00134                           int nResolution,
00135                           const wxString &strHocrClassFilter= wxT( "ocr_line" ) );
00136 
00137   protected:
00138     /// \brief Print the given text at the given coordinates.
00139     ///
00140     /// \param x            abscissa of the origin
00141     /// \param y            ordinate of the origin
00142     /// \param strText      text to print
00143     ///
00144     /// NOTE:
00145     ///        This function only collects text until Flush2Pdf()
00146     ///        is called.
00147     virtual void Print2Pdf( double x, double y, const wxString& strText );
00148 
00149     /// \brief Flush outstanding print-to-PDF-commands.
00150     ///
00151     /// NOTE:
00152     ///        This function does the real printing initiated by
00153     ///        (maybe multiple) calls to Print2Pdf().
00154     virtual void Flush2Pdf();
00155 
00156   protected:
00157     double m_x, m_y;                    ///< Saved printing coordinates.
00158     wxString m_strText;                 ///< Saved outstanding text.
00159 };
00160 #endif // __WXPDFDOC__
00161 
00162 #endif // WXISCANHOCR2PDF_H