Site Overlay

大数据学习01-数据预处理与提取

文章热度: 0 热度

目 录

20190701作业 2

练习 2

需求一:去除日志中的重复行 2

需求二:对每一行工资求平均值 2

需求三:求所有数据的工资平均值 5

提高:需求四:求不同城市,工资的平均值 5

提高:需求五:求不同岗位,工资的平均值 6

提高more:需求六:求不同城市、不同岗位工资的平均值 6

20190701作业

练习

需求一:去除日志中的重复行

我们看到,日志中有很多重复行,我们需要把重复的行去除掉。

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;

/**
 * Prints the distinct lines of a text file, dropping every repeated line.
 *
 * <p>The file defaults to {@code D:\hello.txt}; a different path may be passed as the first
 * command-line argument.
 */
public class main {
    /** File that is read when no path is supplied on the command line. */
    private static final String DEFAULT_PATH = "D:\\hello.txt";

    /**
     * Reads the input file line by line and prints each distinct line exactly once, in the
     * order in which it first appears in the file.
     *
     * @param args optional; when present, {@code args[0]} overrides the default input path
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;

        // A LinkedHashSet rejects duplicates like a HashSet does, but iterates in insertion
        // order, so the de-duplicated output keeps the order of the original log.
        Set<String> items = new LinkedHashSet<String>();

        // try-with-resources closes the reader even when readLine() throws.
        try (BufferedReader bufferedReader = new BufferedReader(new FileReader(path))) {
            String str;
            while ((str = bufferedReader.readLine()) != null) {
                items.add(str);
            }
        }

        for (String txt : items) {
            System.out.println(txt);
        }
    }
}

需求二:对每一行工资求平均值

日志中每一行,有一个字段用来表示工资,工资是一个范围,需要从范围中,取一个中间值,来作为真正的工资。

但是不同行的工资,单位或者表示方式不一样,还需要对不同的表示进行处理。

public Map<Integer,Jobs> insertData(Set<String> jobs)
{
Map<Integer,Jobs> jobsMap = new HashMap<Integer, Jobs>();
for(String txt :jobs)
{
//System.out.println(txt);
Jobs job = new Jobs();
String item[] = txt.split(“,”);
if (item.length != 8) continue;
try{
job.setNumber(Integer.parseInt(item[0]));
}catch (NumberFormatException e)
{
job.setNumber(1);
}
job.setJobName(item[1]);
job.setCompany(item[2]);
job.setRequest(item[3]);
job.setSalary(item[4]);
job.setNetwork(item[5]);
job.setLocation(item[6]);
job.setUnknown(item[7]);
//求平均工资
if (job.getSalary().trim().contains(“面议”))
{
job.setAvgSalary(0);
}
else{
String matcher = “\\d+(.)?(\\d+)?[-|至]\\d+(.)?(\\d+)?(\\D+)?”;
Pattern p = Pattern.compile(matcher);
Matcher m = p.matcher(job.getSalary());

if (m.find()) {
//System.out.println(job.getSalary().substring(m.start(),m.end()));
String temp = job.getSalary().substring(m.start(),m.end()).replaceAll(“[^.|0-9]”,“,”);
//System.out.println(temp);
String temp1[] = temp.split(“,”);
float a=0,sum=0;
int count=0;
for(String aa : temp1)
{
try{
a=Float.parseFloat(aa);
count++;
}catch (NumberFormatException e) {

}finally {
sum+=a;
}
}
if (job.getSalary().contains(“年”)) sum = sum/12;
if (job.getSalary().contains(“万”)) sum = sum*10000;
if (job.getSalary().contains(“K”)) sum = sum*1000;
job.setAvgSalary(sum/count);
//System.out.println(job.getAvgSalary());
}else
{
matcher = “\\d+(.)?(\\d+)?(\\D+)?”;
p = Pattern.compile(matcher);
m = p.matcher(job.getSalary());
if (m.find()) {
//System.out.println(job.getSalary().substring(m.start(),m.end()));
String temp = job.getSalary().substring(m.start(), m.end()).replaceAll(“[^.|0-9]”, “,”);
//System.out.println(temp);
String temp1[] = temp.split(“,”);
float a = 0, sum = 0;
int count = 0;
for (String aa : temp1) {
try {
a = Float.parseFloat(aa);
count++;
} catch (NumberFormatException e) {

} finally {
sum += a;
}
}
if (job.getSalary().contains(“年”)) sum = sum / 12;
if (job.getSalary().contains(“万”)) sum = sum * 10000;
if (job.getSalary().contains(“K”)) sum = sum * 1000;
if (job.getSalary().contains(“天”)) sum = sum * 30;
job.setAvgSalary(sum / count);
//System.out.println(job.getAvgSalary());
}
else
{
System.out.println(job.getSalary());
System.out.println(job.getNumber());
}
}
}
jobsMap.put(job.getNumber(),job);
}
return jobsMap;
}

需求三:求所有数据的工资平均值

/**
 * Computes the mean of {@code getAvgSalary()} over every job in the map.
 *
 * <p>Only the map's values are read. The division is a floating-point one, so an empty map
 * yields {@code NaN} (0.0 / 0) rather than throwing.
 *
 * @param jm the jobs to average
 * @return the sum of all per-job average salaries divided by the number of map entries
 */
public double culculateAverageSalaries(Map<Integer, Jobs> jm) {
    double total = 0;
    for (Jobs entry : jm.values()) {
        total = total + entry.getAvgSalary();
    }
    final int jobCount = jm.size();
    return total / jobCount;
}

提高:需求四:求不同城市,工资的平均值

/**
 * Groups the jobs by {@code getLocation()} and averages {@code getAvgSalary()} inside each
 * group.
 *
 * @param jm the jobs to aggregate; only the map's values are read
 * @return a map from each distinct location to the mean average salary of its jobs
 */
public Map<String, Float> culculateAverageSalaries_City(Map<Integer, Jobs> jm) {
    // Pass 1: collect every salary under its location.
    Map<String, ArrayList<Float>> salariesByCity = new HashMap<>();
    for (Jobs job : jm.values()) {
        String city = job.getLocation();
        ArrayList<Float> bucket = salariesByCity.get(city);
        if (bucket == null) {
            bucket = new ArrayList<>();
            salariesByCity.put(city, bucket);
        }
        bucket.add(job.getAvgSalary());
    }

    // Pass 2: reduce every bucket to its mean.
    Map<String, Float> averages = new HashMap<>();
    for (Map.Entry<String, ArrayList<Float>> group : salariesByCity.entrySet()) {
        ArrayList<Float> bucket = group.getValue();
        int n = bucket.size();
        float mean = 0;
        for (float salary : bucket) {
            // Each salary contributes salary / n, so the running total ends at the mean.
            mean += salary / n;
        }
        averages.put(group.getKey(), mean);
    }
    return averages;
}

提高:需求五:求不同岗位,工资的平均值

/**
 * Groups the jobs by {@code getJobName()} and averages {@code getAvgSalary()} inside each
 * group.
 *
 * @param jm the jobs to aggregate; only the map's values are read
 * @return a map from each distinct job name to the mean average salary of its jobs
 */
public Map<String, Float> culculateAverageSalaries_job(Map<Integer, Jobs> jm) {
    // Pass 1: collect every salary under its job name.
    Map<String, ArrayList<Float>> salariesByJob = new HashMap<>();
    for (Jobs job : jm.values()) {
        String title = job.getJobName();
        ArrayList<Float> bucket = salariesByJob.get(title);
        if (bucket == null) {
            bucket = new ArrayList<>();
            salariesByJob.put(title, bucket);
        }
        bucket.add(job.getAvgSalary());
    }

    // Pass 2: reduce every bucket to its mean.
    Map<String, Float> averages = new HashMap<>();
    for (Map.Entry<String, ArrayList<Float>> group : salariesByJob.entrySet()) {
        ArrayList<Float> bucket = group.getValue();
        int n = bucket.size();
        float mean = 0;
        for (float salary : bucket) {
            // Each salary contributes salary / n, so the running total ends at the mean.
            mean += salary / n;
        }
        averages.put(group.getKey(), mean);
    }
    return averages;
}

提高more:需求六:求不同城市、不同岗位工资的平均值

举例:

北京 大数据开发 20000

北京 算法工程师 30000

北京 大数据架构 35000

上海 大数据开发 20000

上海 算法工程师 25000

上海 大数据架构 30000

/**
 * Groups the jobs by the pair (location, job name) and averages {@code getAvgSalary()} inside
 * each group.
 *
 * <p>Grouping is done through nested string-keyed maps. A {@code String[]} cannot serve as the
 * grouping key because arrays use identity-based {@code equals}/{@code hashCode}: two arrays
 * with the same contents are different keys, so no two jobs would ever share a group.
 *
 * @param jm the jobs to aggregate; only the map's values are read
 * @return one entry per distinct (location, job name) pair; each key is a two-element array
 *     {@code {location, jobName}} and each value is the mean average salary of that group.
 *     Because array keys compare by identity, read the result by iterating its entries, not
 *     by calling {@code get} with a newly built array.
 */
public Map<String[], Float> culculateAverageSalaries_job_city(Map<Integer, Jobs> jm) {
    // location -> job name -> salaries of the jobs in that group
    Map<String, Map<String, ArrayList<Float>>> salariesByCityAndJob = new HashMap<>();
    for (Jobs job : jm.values()) {
        String city = job.getLocation();
        Map<String, ArrayList<Float>> salariesByJob = salariesByCityAndJob.get(city);
        if (salariesByJob == null) {
            salariesByJob = new HashMap<>();
            salariesByCityAndJob.put(city, salariesByJob);
        }

        String title = job.getJobName();
        ArrayList<Float> bucket = salariesByJob.get(title);
        if (bucket == null) {
            bucket = new ArrayList<>();
            salariesByJob.put(title, bucket);
        }
        bucket.add(job.getAvgSalary());
    }

    Map<String[], Float> averages = new HashMap<>();
    for (Map.Entry<String, Map<String, ArrayList<Float>>> cityEntry
            : salariesByCityAndJob.entrySet()) {
        for (Map.Entry<String, ArrayList<Float>> jobEntry : cityEntry.getValue().entrySet()) {
            ArrayList<Float> bucket = jobEntry.getValue();
            int n = bucket.size();
            float mean = 0;
            for (float salary : bucket) {
                mean += salary / n;
            }
            // Exactly one key array is created per group, so the result has no duplicates.
            averages.put(new String[] {cityEntry.getKey(), jobEntry.getKey()}, mean);
        }
    }
    return averages;
}

2+

说点什么

200
  Subscribe  
提醒