13 Nisan 2019 Cumartesi

Apache Spark ile bir txt dosyasında en çok kullanılan kelimeleri bulma

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;

/**
 * Counts how often each word occurs in a text file and prints the
 * {@code (count, word)} pairs ordered from most to least frequent.
 *
 * <p>The input is read from the relative path {@code src/main/resources/input.txt}
 * and processed on a local Spark master using all available cores.
 * Counting is case-sensitive and words of a single character are ignored.
 */
public class Main {
 /** Matches every character that is neither an ASCII letter nor whitespace. */
 private static final String NON_LETTER_REGEX = "[^a-zA-Z\\s]";

 /** Matches one or more consecutive whitespace characters (spaces, tabs, ...). */
 private static final String WHITESPACE_REGEX = "\\s+";

 /**
  * Runs the word-count job.
  *
  * @param args command-line arguments; not used
  */
 public static void main(String[] args)
 {
  SparkConf conf = new SparkConf().setAppName("wordCounter").setMaster("local[*]");

  // try-with-resources guarantees the context is closed even if the job fails.
  try (JavaSparkContext sc = new JavaSparkContext(conf)) {
   JavaRDD<String> inputData = sc.textFile("src/main/resources/input.txt");

   // The range must be A-Z: "A-z" would also let through [ \ ] ^ _ and `.
   inputData.map(line -> line.replaceAll(NON_LETTER_REGEX, ""))
     // Split on any whitespace run, because the cleanup step keeps tabs as well as spaces.
     .flatMap(line -> Arrays.asList(line.split(WHITESPACE_REGEX)).iterator())
     // Drops empty tokens and single-letter words.
     .filter(word -> word.length() > 1)
     .mapToPair(word -> new Tuple2<String, Long>(word, 1L))
     .reduceByKey((left, right) -> left + right)
     // Swap to (count, word) so the pairs can be sorted by count.
     .mapToPair(pair -> new Tuple2<Long, String>(pair._2, pair._1))
     // Descending order, collapsed into a single partition.
     .sortByKey(false, 1)
     .foreach(entry -> System.out.println(entry));
  }
 }
}
Örnek Çıktı : (2509,to) (2011,that) (1552,and) (1420,of) (1287,this) (1246,we) (1243,is) (1098,you) (955,in) (833,on) (779,it) (713,container) (683,So) (670,do) (656,be) (654,for) (617,can) (610,have) (576,And) (552,Docker) (524,going) (480,if) (466,just) (456,its) (441,here) (434,will) (422,but) (414,as) (394,image) (392,now) (365,running) (354,with) (353,need) ..... ..... (1,Gerty) (1,Opt) (1,maths) (1,construct) (1,statement) (1,Course) (1,forwarding) (1,episode) (1,catching) (1,angry) (1,Target) (1,favour)

0 yorum: