java - convert pdf to html pagewise using pdfbox -

- March 15, 2010

I am converting a PDF to HTML by overriding the processTextPosition method of PDFTextStripper. Right now I get the entire text in a single .html file, but I want a separate .html file for every PDF page. Is there a way to do this through the processTextPosition method? My code is below. Here "f" is the path of the output file, and processTextPosition is called in my subclass PDFText2HTML.

protected void processtextposition( textposition text ) {       boolean showcharacter = true;     positionwrapper p =new positionwrapper(text);     textposition t=p.gettextposition();       try     {                 f.createnewfile();             bw= new bufferedwriter(new filewriter(f, true) );                   float a;                 = t.gettextpos().getyposition();                 b=compare(a,b,t);                 system.out.println(b);   }  catch (ioexception e)  {     // todo auto-generated catch block     e.printstacktrace(); } {     try          {          bw.flush();          bw.close();         } catch (ioexception e)           {            // todo auto-generated catch block             e.printstacktrace();           }  }       if(suppressduplicateoverlappingtext)     {         showcharacter = false;         string textcharacter = text.getcharacter();         float textx = text.getx();         float texty = text.gety();         treemap<float, treeset<float>> sametextcharacters = characterlistmapping.get( textcharacter );         if(sametextcharacters == null )         {             sametextcharacters = new treemap<float, treeset<float>>();             characterlistmapping.put( textcharacter, sametextcharacters );         }          boolean suppresscharacter = false;         float tolerance = (text.getwidth()/textcharacter.length())/3.0f;         sortedmap<float, treeset<float>> xmatches =             sametextcharacters.submap(textx - tolerance, textx + tolerance);         for(treeset<float> xmatch : xmatches.values())          {             sortedset<float> ymatches =                 xmatch.subset(texty - tolerance , texty + tolerance);             if (!ymatches.isempty())              {                 suppresscharacter = true;                 break;             }         }         if( !suppresscharacter )         {             treeset<float> yset = sametextcharacters.get(textx);             if (yset == null)              {                 yset = new 
treeset<float>();                 sametextcharacters.put( textx,  yset );             }             yset.add( texty );             showcharacter = true;         }     }     if( showcharacter )     {          int foundarticledivisionindex = -1;         int notfoundbutfirstleftandabovearticledivisionindex = -1;         int notfoundbutfirstleftarticledivisionindex = -1;         int notfoundbutfirstabovearticledivisionindex = -1;         float x = text.getx();         float y = text.gety();         if( shouldseparatebybeads )         {             for( int i=0; i<pagearticles.size() && foundarticledivisionindex == -1; i++ )             {                 pdthreadbead bead = (pdthreadbead)pagearticles.get( );                 if( bead != null )                 {                     pdrectangle rect = bead.getrectangle();                     if(rect.contains( x, y ) )                     {                         foundarticledivisionindex = i*2+1;                     }                     else if( (x < rect.getlowerleftx() ||                             y < rect.getupperrighty()) &&                             notfoundbutfirstleftandabovearticledivisionindex == -1)                     {                         notfoundbutfirstleftandabovearticledivisionindex = i*2;                     }                     else if( x < rect.getlowerleftx() &&                             notfoundbutfirstleftarticledivisionindex == -1)                     {                         notfoundbutfirstleftarticledivisionindex = i*2;                     }                     else if( y < rect.getupperrighty() &&                             notfoundbutfirstabovearticledivisionindex == -1)                     {                         notfoundbutfirstabovearticledivisionindex = i*2;                     }                 }                 else                 {                     foundarticledivisionindex = 0;                 }             }         }         else         {             
foundarticledivisionindex = 0;         }         int articledivisionindex = -1;         if( foundarticledivisionindex != -1 )         {             articledivisionindex = foundarticledivisionindex;         }         else if( notfoundbutfirstleftandabovearticledivisionindex != -1 )         {             articledivisionindex = notfoundbutfirstleftandabovearticledivisionindex;         }         else if( notfoundbutfirstleftarticledivisionindex != -1 )         {             articledivisionindex = notfoundbutfirstleftarticledivisionindex;         }         else if( notfoundbutfirstabovearticledivisionindex != -1 )         {             articledivisionindex = notfoundbutfirstabovearticledivisionindex;         }         else         {             articledivisionindex = charactersbyarticle.size()-1;         }          list<textposition> textlist = (list<textposition>) charactersbyarticle.get( articledivisionindex );           if(textlist.isempty())         {             textlist.add(text);         }         else         {             textposition previoustextposition = (textposition)textlist.get(textlist.size()-1);             if(text.isdiacritic() && previoustextposition.contains(text))             {                 previoustextposition.mergediacritic(text, normalize);             }             /* if previous textposition diacritic, merge              * 1 , remove list. */             else if(previoustextposition.isdiacritic() && text.contains(previoustextposition))             {                 text.mergediacritic(previoustextposition, normalize);                 textlist.remove(textlist.size()-1);                 textlist.add(text);             }             else             {                 textlist.add(text);             }         }     }     }

And the body of the compare method is:

/**
 * Writes one glyph to {@code bw} as a styled {@code <span>} element.
 *
 * <p>When {@code a} differs from {@code b} the span is preceded by a {@code <br>}, i.e. a change
 * of Y position starts a new HTML line. The comparison is an exact float comparison.
 *
 * <p>The style attribute is emitted as well-formed CSS declarations ({@code font-family}
 * spelled correctly, every declaration terminated by {@code ;}, lengths in {@code pt}), and the
 * font name and glyph text are HTML-escaped so characters such as {@code <}, {@code &} or
 * {@code "} cannot break the generated markup.
 *
 * @param a Y position of the glyph being written
 * @param b Y position of the previously written glyph
 * @param t the glyph to write
 * @return {@code a}, to be stored by the caller as the new "previous" Y position
 * @throws IOException if writing to {@code bw} fails
 */
private float compare(float a, float b, TextPosition t) throws IOException
{
    StringBuilder html = new StringBuilder();
    if (a != b)
    {
        // Y position changed: start a new line in the HTML output.
        html.append("<br>");
    }
    html.append("<span style=\"font-size:").append(t.getFontSizeInPt()).append("pt;")
        .append("font-family:").append(escapeHtml(t.getFont().getBaseFont())).append(';')
        .append("width:").append(t.getWidth()).append("pt;")
        .append("left:").append(t.getTextPos().getXPosition()).append("pt;")
        .append("top:").append(t.getTextPos().getYPosition()).append("pt;")
        .append("\">")
        .append(escapeHtml(t.getCharacter()))
        .append("</span>");
    bw.write(html.toString());
    return a;
}

/** Escapes the characters that are significant in HTML text and attribute values. */
private static String escapeHtml(String s)
{
    if (s == null)
    {
        return "";
    }
    StringBuilder out = new StringBuilder(s.length());
    for (int i = 0; i < s.length(); i++)
    {
        char c = s.charAt(i);
        switch (c)
        {
            case '&':
                out.append("&amp;");
                break;
            case '<':
                out.append("&lt;");
                break;
            case '>':
                out.append("&gt;");
                break;
            case '"':
                out.append("&quot;");
                break;
            default:
                out.append(c);
        }
    }
    return out.toString();
}

Search This Blog

Crty

java - convert pdf to html pagewise using pdfbox -

Comments

Post a Comment

Popular posts from this blog

c# - MSAA finds controls UI Automation doesn't -

python - mat is not a numerical tuple : openCV error -

wordpress - .htaccess: RewriteRule: bad flag delimiters -