Java - Convert PDF to HTML page by page using PDFBox -
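I am converting a PDF to HTML by overriding the processTextPosition method of PDFTextStripper. Right now I am getting the entire text in a single .html file, but I want a separate .html file for every PDF page. Is there a way to do this through the processTextPosition method? My code is below. Here "f" is the path of the output file, and processTextPosition is called from the subclass PDFText2HTML.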
/*
 * Handles a single text position and does two things with it.
 *
 * 1. HTML output: wraps the incoming text in a positionwrapper, creates/opens the
 *    file `f` in append mode, and calls compare(a, b, t), which writes the glyph
 *    through the shared writer `bw`. The value compare returns is stored back in
 *    `b` and printed to standard output. The writer is then flushed and closed,
 *    so the file is reopened on every call.
 *
 * 2. Stripper bookkeeping:
 *    - When suppressduplicateoverlappingtext is set, the character is dropped if
 *      characterlistmapping already holds the same character within `tolerance`
 *      (one third of the average per-character width) on both x and y; otherwise
 *      its x/y is recorded and it is shown.
 *    - A shown character is assigned an article division index: when
 *      shouldseparatebybeads is set, from the first bead rectangle that contains
 *      it (i*2+1) or the first one it lies left of / above (i*2); otherwise 0.
 *      If no candidate was found, the last entry of charactersbyarticle is used.
 *    - The character is appended to that article's list, merging diacritics with
 *      the previous entry when one contains the other.
 *
 * NOTE(review): as written here the flush/close block is a bare `{ ... }` after the
 * catch, not a `finally`. If creating the filewriter throws, `bw` keeps whatever
 * value it had before (possibly null or an already-closed writer) -- confirm
 * against the real source.
 * NOTE(review): `float a; = ...` and `pagearticles.get( )` look like they lost a
 * token (presumably `a` and `i`); the block does not compile as shown.
 * NOTE(review): the `//` line comments below sit on the same physical line as the
 * code that follows them, so in this layout they comment out the rest of the line.
 */
protected void processtextposition( textposition text ) { boolean showcharacter = true; positionwrapper p =new positionwrapper(text); textposition t=p.gettextposition(); try { f.createnewfile(); bw= new bufferedwriter(new filewriter(f, true) ); float a; = t.gettextpos().getyposition(); b=compare(a,b,t); system.out.println(b); } catch (ioexception e) { // todo auto-generated catch block e.printstacktrace(); } { try { bw.flush(); bw.close(); } catch (ioexception e) { // todo auto-generated catch block e.printstacktrace(); } } if(suppressduplicateoverlappingtext) { showcharacter = false; string textcharacter = text.getcharacter(); float textx = text.getx(); float texty = text.gety(); treemap<float, treeset<float>> sametextcharacters = characterlistmapping.get( textcharacter ); if(sametextcharacters == null ) { sametextcharacters = new treemap<float, treeset<float>>(); characterlistmapping.put( textcharacter, sametextcharacters ); } boolean suppresscharacter = false; float tolerance = (text.getwidth()/textcharacter.length())/3.0f; sortedmap<float, treeset<float>> xmatches = sametextcharacters.submap(textx - tolerance, textx + tolerance); for(treeset<float> xmatch : xmatches.values()) { sortedset<float> ymatches = xmatch.subset(texty - tolerance , texty + tolerance); if (!ymatches.isempty()) { suppresscharacter = true; break; } } if( !suppresscharacter ) { treeset<float> yset = sametextcharacters.get(textx); if (yset == null) { yset = new treeset<float>(); sametextcharacters.put( textx, yset ); } yset.add( texty ); showcharacter = true; } } if( showcharacter ) { int foundarticledivisionindex = -1; int notfoundbutfirstleftandabovearticledivisionindex = -1; int notfoundbutfirstleftarticledivisionindex = -1; int notfoundbutfirstabovearticledivisionindex = -1; float x = text.getx(); float y = text.gety(); if( shouldseparatebybeads ) { for( int i=0; i<pagearticles.size() && foundarticledivisionindex == -1; i++ ) { pdthreadbead bead = (pdthreadbead)pagearticles.get( ); if( 
bead != null ) { pdrectangle rect = bead.getrectangle(); if(rect.contains( x, y ) ) { foundarticledivisionindex = i*2+1; } else if( (x < rect.getlowerleftx() || y < rect.getupperrighty()) && notfoundbutfirstleftandabovearticledivisionindex == -1) { notfoundbutfirstleftandabovearticledivisionindex = i*2; } else if( x < rect.getlowerleftx() && notfoundbutfirstleftarticledivisionindex == -1) { notfoundbutfirstleftarticledivisionindex = i*2; } else if( y < rect.getupperrighty() && notfoundbutfirstabovearticledivisionindex == -1) { notfoundbutfirstabovearticledivisionindex = i*2; } } else { foundarticledivisionindex = 0; } } } else { foundarticledivisionindex = 0; } int articledivisionindex = -1; if( foundarticledivisionindex != -1 ) { articledivisionindex = foundarticledivisionindex; } else if( notfoundbutfirstleftandabovearticledivisionindex != -1 ) { articledivisionindex = notfoundbutfirstleftandabovearticledivisionindex; } else if( notfoundbutfirstleftarticledivisionindex != -1 ) { articledivisionindex = notfoundbutfirstleftarticledivisionindex; } else if( notfoundbutfirstabovearticledivisionindex != -1 ) { articledivisionindex = notfoundbutfirstabovearticledivisionindex; } else { articledivisionindex = charactersbyarticle.size()-1; } list<textposition> textlist = (list<textposition>) charactersbyarticle.get( articledivisionindex ); if(textlist.isempty()) { textlist.add(text); } else { textposition previoustextposition = (textposition)textlist.get(textlist.size()-1); if(text.isdiacritic() && previoustextposition.contains(text)) { previoustextposition.mergediacritic(text, normalize); } /* if previous textposition diacritic, merge * 1 , remove list. */ else if(previoustextposition.isdiacritic() && text.contains(previoustextposition)) { text.mergediacritic(previoustextposition, normalize); textlist.remove(textlist.size()-1); textlist.add(text); } else { textlist.add(text); } } } }
And the body of the compare method is:
/**
 * Writes one glyph to the HTML output as a styled {@code <span>}, preceded by a
 * {@code <br>} whenever its y position differs from that of the previous glyph.
 *
 * @param a y position of the glyph being written
 * @param b y position of the previously written glyph
 * @param t the glyph; supplies font size, font name, width, position and text
 * @return {@code a}, so the caller can pass it as {@code b} on the next call
 * @throws IOException if writing to {@code bw} fails
 */
private float compare(float a, float b, TextPosition t) throws IOException {
    // Build the markup first so nothing is written if reading the glyph fails.
    String span = buildSpan(t);
    if (a != b) {
        // A different y position means the glyph starts a new line of text.
        bw.write("<br>" + span);
    } else {
        bw.write(span);
    }
    return a;
}

/** Builds the {@code <span>} element for one glyph with a well-formed inline style. */
private String buildSpan(TextPosition t) {
    StringBuilder span = new StringBuilder("<span style=\"");
    span.append("font-size:").append(t.getFontSizeInPt()).append("pt;");
    // The property is "font-family", and every declaration must end with ';';
    // the earlier output ran the font name, width and left together.
    span.append("font-family:")
        .append(escapeHtml(String.valueOf(t.getFont().getBaseFont())))
        .append(';');
    // NOTE(review): assumes width and the y position are in points, like the
    // x position already was -- confirm. A non-zero CSS length without a unit
    // is invalid, so both now carry "pt".
    span.append("width:").append(t.getWidth()).append("pt;");
    span.append("left:").append(t.getTextPos().getXPosition()).append("pt;");
    span.append("top:").append(t.getTextPos().getYPosition()).append("pt;");
    span.append("\">").append(escapeHtml(t.getCharacter())).append("</span>");
    return span.toString();
}

/** Escapes the characters that would otherwise break HTML text or attribute values. */
private String escapeHtml(String s) {
    StringBuilder out = new StringBuilder(s.length());
    for (int i = 0; i < s.length(); i++) {
        char c = s.charAt(i);
        switch (c) {
            case '&':
                out.append("&amp;");
                break;
            case '<':
                out.append("&lt;");
                break;
            case '>':
                out.append("&gt;");
                break;
            case '"':
                out.append("&quot;");
                break;
            default:
                out.append(c);
                break;
        }
    }
    return out.toString();
}
Comments
Post a Comment