in this program, i have given wikipeadia URL for text extraction logic but after extraction of text for loops are taking to much time to execute.
the same logic too fast in python program.
how to reduces execution time ?
the same logic too fast in python program.
how to reduces execution time ?
Code:
import java.io.IOException; import java.net.URL; import java.util.Scanner; import java.util.regex.Matcher; import java.util.regex.Pattern; public class TextExtraction1 { static TextExtraction1 fj; public String toHtmlString(String url) throws IOException { StringBuilder sb = new StringBuilder(); for(Scanner sc = new Scanner(new URL(url).openStream()); sc.hasNext(); ) sb.append(sc.nextLine()).append('\n'); return sb.toString(); } static int search(String key,String target) { int count=0; Pattern p=Pattern.compile(key); Matcher m=p.matcher(target); while(m.find()){count++;} return count; } String extractText(String s) throws IOException { String h1 = fj.toHtmlString(s); System.out.println("extracted \n\n"); int i2=0; String h2[] = h1.split("\n"); String html=""; long start = System.currentTimeMillis(); for(String h3:h2) { //bw.write(h3);bw.newLine(); html += h3; html += ""; //iu=iu+1; } long end = System.currentTimeMillis(); System.out.println(++i2+" th loop end in "+(end-start)/1000+" seconds"); boolean capture = true; String filtered_text = ""; String html_text[] = html.split("<"); String h_text[];//System.out.println("kyhe1"); start = System.currentTimeMillis(); for(String h:html_text) { h = "<" + h; h_text = h.split(">"); for(String w :h_text) { if(w.length()>0) { if(w.substring(0, 1).equals("<")){w +=">";} } if(search("</script>",w)>0){capture=true;} else if(search("<script",w)>0){capture=false;} else if(capture){filtered_text += w; filtered_text += "\n";} } } // System.out.println("kyhe1"); end = System.currentTimeMillis(); html_text = filtered_text.split("\n"); System.out.println(++i2+" th loop end in "+(end-start)/1000+" seconds"); return html_text[0]; } public static void main(String []args)throws IOException { fj = new TextExtraction1(); System.out.println(fj.extractText("https://en.wikipedia.org/wiki/Varanasi")); } }
Comment