数据读取与归一化
This commit is contained in:
parent
f9d103f6a2
commit
fdea6dafd4
26
pom.xml
26
pom.xml
|
|
@ -19,8 +19,34 @@
|
|||
<maven.compiler.target>21</maven.compiler.target>
|
||||
<maven.compiler.compilerVersion>21</maven.compiler.compilerVersion>
|
||||
<failOnMissingWebXml>false</failOnMissingWebXml>
|
||||
<dl4j-master.version>1.0.0-M2.1</dl4j-master.version>
|
||||
<nd4j.backend>nd4j-native</nd4j.backend>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<!-- NOTE(review): the Java sources in this change import org.apache.commons.lang3.StringUtils, but commons-lang3 is not declared in this block. Presumably it is resolved transitively; confirm, or declare it explicitly. -->
<!-- ND4J backend. The artifact id and version are taken from the nd4j.backend and dl4j-master.version properties. -->
<dependency>
|
||||
<groupId>org.nd4j</groupId>
|
||||
<artifactId>${nd4j.backend}</artifactId>
|
||||
<version>${dl4j-master.version}</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Apache Commons CSV, used to parse the CSV test data. -->
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-csv</artifactId>
|
||||
<version>1.10.0</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Lombok annotations for the bean classes. Declared with provided scope, so it is on the compile classpath only. -->
<!-- https://mvnrepository.com/artifact/org.projectlombok/lombok -->
|
||||
<dependency>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
<version>1.18.30</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ package io.lroyia;
|
|||
* @author lroyia
|
||||
* @since 2023/10/20 9:33
|
||||
**/
|
||||
public class ApplicationRun {
|
||||
public class KnnRun {
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
package io.lroyia.bean;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.experimental.Accessors;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* 主体分类计算转换bean
|
||||
* @author lroyia
|
||||
* @since 2023/10/24 9:48
|
||||
**/
|
||||
@Data
|
||||
@Accessors(chain = true)
|
||||
public class EntCalInfo implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
/**
|
||||
* 主体id
|
||||
*/
|
||||
private String pripid;
|
||||
|
||||
/**
|
||||
* 企业名称
|
||||
*/
|
||||
private String entName;
|
||||
|
||||
/**
|
||||
* 成立时间
|
||||
*/
|
||||
private double estDate;
|
||||
|
||||
/**
|
||||
* 企业类型
|
||||
*/
|
||||
private double entType;
|
||||
|
||||
/**
|
||||
* 注册金额
|
||||
*/
|
||||
private double regCap;
|
||||
|
||||
/**
|
||||
* 行业门类
|
||||
*/
|
||||
private String industryPhy;
|
||||
|
||||
/**
|
||||
* 行业编码
|
||||
*/
|
||||
private double industryCo;
|
||||
|
||||
/**
|
||||
* 登记状态
|
||||
*/
|
||||
private double regState;
|
||||
}
|
||||
|
|
@ -0,0 +1,93 @@
|
|||
package io.lroyia.bean;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.experimental.Accessors;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.math.BigDecimal;
|
||||
import java.time.LocalDate;
|
||||
|
||||
/**
|
||||
* 主体信息
|
||||
*
|
||||
* @author lroyia
|
||||
* @since 2023/10/24 9:19
|
||||
**/
|
||||
@Data
|
||||
@Accessors(chain = true)
|
||||
public class EntInfo implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
/**
|
||||
* 主体id
|
||||
*/
|
||||
private String pripid;
|
||||
|
||||
/**
|
||||
* 企业名称
|
||||
*/
|
||||
private String entName;
|
||||
|
||||
/**
|
||||
* 成立日期
|
||||
*/
|
||||
private LocalDate estDate;
|
||||
|
||||
/**
|
||||
* 企业类型
|
||||
*/
|
||||
private String entType;
|
||||
|
||||
/**
|
||||
* 注册金额
|
||||
*/
|
||||
private BigDecimal regCap;
|
||||
|
||||
/**
|
||||
* 行业门类
|
||||
*/
|
||||
private String industryPhy;
|
||||
|
||||
/**
|
||||
* 行业编码
|
||||
*/
|
||||
private String industryCo;
|
||||
|
||||
/**
|
||||
* 登记状态
|
||||
*/
|
||||
private String regState;
|
||||
|
||||
/**
|
||||
* 转计算Bean
|
||||
*
|
||||
* @return 转换结果
|
||||
* @author lroyia
|
||||
* @since 2023年10月24日 09:54:11
|
||||
*/
|
||||
public EntCalInfo toCalInfo() {
|
||||
EntCalInfo result = new EntCalInfo();
|
||||
result.setPripid(pripid);
|
||||
result.setEntName(entName);
|
||||
result.setIndustryPhy(industryPhy);
|
||||
if (estDate != null) {
|
||||
LocalDate now = LocalDate.now();
|
||||
result.setEstDate(now.getYear() - estDate.getYear());
|
||||
}
|
||||
if (StringUtils.isNotBlank(entType)) {
|
||||
result.setEntType(Double.parseDouble(entType));
|
||||
}
|
||||
if (regCap != null) {
|
||||
result.setRegCap(regCap.doubleValue());
|
||||
}
|
||||
if (StringUtils.isNotBlank(industryCo)) {
|
||||
result.setIndustryCo(Double.parseDouble(industryCo));
|
||||
}
|
||||
if (StringUtils.isNotBlank(regState)) {
|
||||
result.setRegState(Double.parseDouble(regState));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,153 @@
|
|||
package io.lroyia.util;
|
||||
|
||||
import io.lroyia.bean.EntCalInfo;
|
||||
import io.lroyia.bean.EntInfo;
|
||||
import org.apache.commons.csv.CSVFormat;
|
||||
import org.apache.commons.csv.CSVParser;
|
||||
import org.apache.commons.csv.CSVRecord;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.math.BigDecimal;
|
||||
import java.time.LocalDate;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 数据操作工具
|
||||
*
|
||||
* @author lroyia
|
||||
* @since 2023/10/24 9:26
|
||||
**/
|
||||
public abstract class DataUtil {
|
||||
|
||||
/**
|
||||
* 获取所有测试数据
|
||||
*
|
||||
* @return 测试数据
|
||||
* @author lroyia
|
||||
* @since 2023年10月24日 09:43:48
|
||||
*/
|
||||
public static List<EntInfo> getAllTestEntInfo() {
|
||||
CSVFormat format = CSVFormat.Builder.create()
|
||||
.setHeader() // 读取header作为csv的key,否则CSVRecord.get(headerName)会报错
|
||||
.setSkipHeaderRecord(true) // 跳过第一行的列名,列名单独是文件的自行搜索CSVFormat构造
|
||||
.build();
|
||||
try (InputStream resourceAsStream = DataUtil.class.getResourceAsStream("/市场主体测试数据.csv");
|
||||
InputStreamReader reader = new InputStreamReader(resourceAsStream)) {
|
||||
CSVParser parse = format.parse(reader);
|
||||
List<EntInfo> result = new ArrayList<>();
|
||||
for (CSVRecord each : parse) {
|
||||
EntInfo atom = new EntInfo();
|
||||
result.add(atom);
|
||||
atom.setPripid(each.get("PRIPID"));
|
||||
atom.setEntName(each.get("ENTNAME"));
|
||||
String estDateStr = each.get("ESTDATE");
|
||||
if (StringUtils.isNotBlank(estDateStr)) {
|
||||
String[] dateArr = estDateStr.split(" ")[0].split("-");
|
||||
atom.setEstDate(LocalDate.of(Integer.parseInt(dateArr[0]), Integer.parseInt(dateArr[1]), Integer.parseInt(dateArr[2])));
|
||||
}
|
||||
String entType = each.get("SUBENTTYPE");
|
||||
if (StringUtils.isBlank(entType)) {
|
||||
entType = each.get("ENTTYPE");
|
||||
}
|
||||
atom.setEntType(entType);
|
||||
String regCap = each.get("REGCAP");
|
||||
if (StringUtils.isNotBlank(regCap)) {
|
||||
atom.setRegCap(new BigDecimal(regCap));
|
||||
}
|
||||
atom.setIndustryPhy(each.get("INDUSTRYPHY"));
|
||||
atom.setIndustryCo(each.get("INDUSTRYCO"));
|
||||
atom.setRegState(each.get("ENTSTATE"));
|
||||
}
|
||||
return result;
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取所有计算Bean
|
||||
*
|
||||
* @return 计算Bean清单
|
||||
* @author lroyia
|
||||
* @since 2023年10月24日 09:57:15
|
||||
*/
|
||||
public static List<EntCalInfo> getAllCalInfo() {
|
||||
List<EntInfo> allTestEntInfo = getAllTestEntInfo();
|
||||
List<EntCalInfo> result = new ArrayList<>();
|
||||
for (EntInfo each : allTestEntInfo) {
|
||||
result.add(each.toCalInfo());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* 归一
|
||||
*
|
||||
* @param list list
|
||||
* @return 归一结果
|
||||
* @author lroyia
|
||||
* @since 2023年10月24日 10:36:08
|
||||
*/
|
||||
public static List<EntCalInfo> toOne(List<EntCalInfo> list) {
|
||||
List<Double> estDateList = new ArrayList<>(list.size());
|
||||
List<Double> entTypeList = new ArrayList<>(list.size());
|
||||
List<Double> regCapList = new ArrayList<>(list.size());
|
||||
List<Double> industryCoList = new ArrayList<>(list.size());
|
||||
List<Double> regStateList = new ArrayList<>(list.size());
|
||||
for (EntCalInfo each : list) {
|
||||
estDateList.add(each.getEstDate());
|
||||
entTypeList.add(each.getEntType());
|
||||
regCapList.add(each.getRegCap());
|
||||
industryCoList.add(each.getIndustryCo());
|
||||
regStateList.add(each.getRegState());
|
||||
}
|
||||
estDateList = columnToOne(estDateList);
|
||||
entTypeList = columnToOne(entTypeList);
|
||||
regCapList = columnToOne(regCapList);
|
||||
industryCoList = columnToOne(industryCoList);
|
||||
regStateList = columnToOne(regStateList);
|
||||
for (int i = 0; i < list.size(); i++) {
|
||||
EntCalInfo each = list.get(i);
|
||||
each.setEstDate(estDateList.get(i))
|
||||
.setEntType(entTypeList.get(i))
|
||||
.setRegCap(regCapList.get(i))
|
||||
.setIndustryCo(industryCoList.get(i))
|
||||
.setRegState(regStateList.get(i));
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
/**
|
||||
* 列归一
|
||||
*
|
||||
* @param list 列数据
|
||||
* @return 归一
|
||||
* @author lroyia
|
||||
* @since 2023年10月24日 10:27:05
|
||||
*/
|
||||
private static List<Double> columnToOne(List<Double> list) {
|
||||
double max = Double.MIN_VALUE;
|
||||
double min = Double.MAX_VALUE;
|
||||
for (Double each : list) {
|
||||
max = Math.max(each, max);
|
||||
min = Math.min(each, min);
|
||||
}
|
||||
double divisor = max - min;
|
||||
List<Double> result = new ArrayList<>(list.size());
|
||||
for (Double each : list) {
|
||||
result.add((each - min) / divisor);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
List<EntCalInfo> one = toOne(getAllCalInfo());
|
||||
for (EntCalInfo each : one) {
|
||||
System.out.println(each);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Can't render this file because it is too large.
|
Loading…
Reference in New Issue