数据读取与归一化

This commit is contained in:
黎润豪 2023-10-24 10:43:53 +08:00
parent f9d103f6a2
commit fdea6dafd4
6 changed files with 331 additions and 1 deletions

26
pom.xml
View File

@ -19,8 +19,34 @@
<maven.compiler.target>21</maven.compiler.target>
<maven.compiler.compilerVersion>21</maven.compiler.compilerVersion>
<failOnMissingWebXml>false</failOnMissingWebXml>
<dl4j-master.version>1.0.0-M2.1</dl4j-master.version>
<nd4j.backend>nd4j-native</nd4j.backend>
</properties>
<dependencies>
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>${nd4j.backend}</artifactId>
<version>${dl4j-master.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
<version>1.10.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.projectlombok/lombok -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.30</version>
<scope>provided</scope>
</dependency>
</dependencies>
<build>
<plugins>

View File

@ -6,7 +6,7 @@ package io.lroyia;
* @author lroyia
* @since 2023/10/20 9:33
**/
public class ApplicationRun {
public class KnnRun {
public static void main(String[] args) {

View File

@ -0,0 +1,58 @@
package io.lroyia.bean;
import lombok.Data;
import lombok.experimental.Accessors;
import java.io.Serializable;
/**
 * Numeric "calculation" view of a market entity, produced by {@code EntInfo.toCalInfo()}.
 *
 * <p>The identifying fields ({@code pripid}, {@code entName}, {@code industryPhy}) stay as
 * strings; every other attribute is held as a primitive {@code double}. Because the numeric
 * fields are primitives, an attribute that was never set reads as {@code 0.0}.
 *
 * <p>Lombok's {@code @Data} generates the getters, setters, {@code equals}, {@code hashCode}
 * and {@code toString}; {@code @Accessors(chain = true)} makes each setter return
 * {@code this} so calls can be chained.
 *
 * @author lroyia
 * @since 2023/10/24 9:48
 **/
@Data
@Accessors(chain = true)
public class EntCalInfo implements Serializable {
private static final long serialVersionUID = 1L;
/**
 * Entity id.
 */
private String pripid;
/**
 * Enterprise name.
 */
private String entName;
/**
 * Establishment time expressed as a number. {@code EntInfo.toCalInfo()} stores the
 * difference between the current calendar year and the establishment year here, so this
 * is an age in years rather than a date.
 */
private double estDate;
/**
 * Enterprise type, as a number.
 */
private double entType;
/**
 * Registered capital.
 */
private double regCap;
/**
 * Industry category. Kept as text; it is not one of the numeric columns.
 */
private String industryPhy;
/**
 * Industry code, as a number.
 */
private double industryCo;
/**
 * Registration state, as a number.
 */
private double regState;
}

View File

@ -0,0 +1,93 @@
package io.lroyia.bean;
import lombok.Data;
import lombok.experimental.Accessors;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
import java.math.BigDecimal;
import java.time.LocalDate;
/**
 * Market entity record holding the typed values of one entity before they are converted
 * to numbers for calculation.
 *
 * <p>Lombok generates the accessors; setters return {@code this} so calls can be chained.
 *
 * @author lroyia
 * @since 2023/10/24 9:19
 **/
@Data
@Accessors(chain = true)
public class EntInfo implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Entity id. */
    private String pripid;

    /** Enterprise name. */
    private String entName;

    /** Establishment date. */
    private LocalDate estDate;

    /** Enterprise type; parsed as a number by {@link #toCalInfo()}. */
    private String entType;

    /** Registered capital. */
    private BigDecimal regCap;

    /** Industry category. */
    private String industryPhy;

    /** Industry code; parsed as a number by {@link #toCalInfo()}. */
    private String industryCo;

    /** Registration state; parsed as a number by {@link #toCalInfo()}. */
    private String regState;

    /**
     * Converts this record into its numeric calculation bean.
     *
     * <p>The id, name and industry category are copied as-is. The establishment date
     * becomes "current year minus establishment year"; the capital is converted with
     * {@link BigDecimal#doubleValue()}; the type, industry code and registration state are
     * parsed with {@link Double#parseDouble(String)}. A {@code null} or blank source value
     * is skipped, leaving the target field at its initial value.
     *
     * @return a newly created {@link EntCalInfo}
     * @throws NumberFormatException if a non-blank type, industry code or registration
     *                               state is not a parsable number
     * @author lroyia
     * @since 2023-10-24 09:54:11
     */
    public EntCalInfo toCalInfo() {
        EntCalInfo cal = new EntCalInfo()
                .setPripid(pripid)
                .setEntName(entName)
                .setIndustryPhy(industryPhy);

        if (estDate != null) {
            // Whole calendar years only; month and day are ignored.
            int currentYear = LocalDate.now().getYear();
            cal.setEstDate(currentYear - estDate.getYear());
        }
        if (!StringUtils.isBlank(entType)) {
            cal.setEntType(Double.parseDouble(entType));
        }
        if (regCap != null) {
            cal.setRegCap(regCap.doubleValue());
        }
        if (!StringUtils.isBlank(industryCo)) {
            cal.setIndustryCo(Double.parseDouble(industryCo));
        }
        if (!StringUtils.isBlank(regState)) {
            cal.setRegState(Double.parseDouble(regState));
        }
        return cal;
    }
}

View File

@ -0,0 +1,153 @@
package io.lroyia.util;
import io.lroyia.bean.EntCalInfo;
import io.lroyia.bean.EntInfo;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.math.BigDecimal;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
/**
 * Data helpers: loads the bundled market-entity test CSV, converts its rows to calculation
 * beans and min-max normalizes their numeric columns.
 *
 * <p>All members are static; the class is declared {@code abstract} so it cannot be
 * instantiated directly.
 *
 * @author lroyia
 * @since 2023/10/24 9:26
 **/
public abstract class DataUtil {

    /** Classpath location of the bundled test-data CSV. */
    private static final String TEST_DATA_RESOURCE = "/市场主体测试数据.csv";

    /**
     * Reads every data record of the bundled test CSV into an {@link EntInfo}.
     *
     * <p>The file is decoded as UTF-8 explicitly so the result does not depend on the
     * JVM's default charset.
     *
     * @return one {@link EntInfo} per CSV data record, in file order
     * @throws NullPointerException  if the CSV resource is not on the classpath
     * @throws UncheckedIOException  if the resource cannot be read
     * @throws NumberFormatException if a non-blank date or capital cell is not numeric
     * @author lroyia
     * @since 2023-10-24 09:43:48
     */
    public static List<EntInfo> getAllTestEntInfo() {
        CSVFormat format = CSVFormat.Builder.create()
                .setHeader() // take column names from the first record so get(name) works
                .setSkipHeaderRecord(true) // do not hand the header row back as data
                .build();
        try (InputStream in = Objects.requireNonNull(
                     DataUtil.class.getResourceAsStream(TEST_DATA_RESOURCE),
                     "classpath resource not found: " + TEST_DATA_RESOURCE);
             InputStreamReader reader = new InputStreamReader(in, StandardCharsets.UTF_8);
             CSVParser parser = format.parse(reader)) {
            List<EntInfo> result = new ArrayList<>();
            for (CSVRecord each : parser) {
                result.add(toEntInfo(each));
            }
            return result;
        } catch (IOException e) {
            throw new UncheckedIOException("failed to read " + TEST_DATA_RESOURCE, e);
        }
    }

    /** Maps one CSV record onto a new {@link EntInfo}. */
    private static EntInfo toEntInfo(CSVRecord record) {
        EntInfo atom = new EntInfo();
        atom.setPripid(record.get("PRIPID"));
        atom.setEntName(record.get("ENTNAME"));

        String estDateStr = record.get("ESTDATE");
        if (StringUtils.isNotBlank(estDateStr)) {
            // Keep only the part before the first space, then read year-month-day.
            String[] dateArr = estDateStr.strip().split(" ")[0].split("-");
            atom.setEstDate(LocalDate.of(
                    Integer.parseInt(dateArr[0]),
                    Integer.parseInt(dateArr[1]),
                    Integer.parseInt(dateArr[2])));
        }

        // Prefer the sub-type column; fall back to the main type when it is blank.
        String entType = record.get("SUBENTTYPE");
        if (StringUtils.isBlank(entType)) {
            entType = record.get("ENTTYPE");
        }
        atom.setEntType(entType);

        String regCap = record.get("REGCAP");
        if (StringUtils.isNotBlank(regCap)) {
            // BigDecimal(String) rejects surrounding whitespace, so strip it first.
            atom.setRegCap(new BigDecimal(regCap.strip()));
        }

        atom.setIndustryPhy(record.get("INDUSTRYPHY"));
        atom.setIndustryCo(record.get("INDUSTRYCO"));
        atom.setRegState(record.get("ENTSTATE"));
        return atom;
    }

    /**
     * Loads all test records and converts each one to its calculation bean.
     *
     * @return the calculation beans, in the same order as the CSV records
     * @author lroyia
     * @since 2023-10-24 09:57:15
     */
    public static List<EntCalInfo> getAllCalInfo() {
        List<EntInfo> allTestEntInfo = getAllTestEntInfo();
        List<EntCalInfo> result = new ArrayList<>(allTestEntInfo.size());
        for (EntInfo each : allTestEntInfo) {
            result.add(each.toCalInfo());
        }
        return result;
    }

    /**
     * Min-max normalizes the numeric columns (estDate, entType, regCap, industryCo,
     * regState) of the given beans, each column independently.
     *
     * <p>The beans are modified in place and the same list instance is returned.
     *
     * @param list beans to normalize; must not be {@code null}
     * @return {@code list} itself, with its elements' numeric fields rescaled
     * @author lroyia
     * @since 2023-10-24 10:36:08
     */
    public static List<EntCalInfo> toOne(List<EntCalInfo> list) {
        List<Double> estDateList = new ArrayList<>(list.size());
        List<Double> entTypeList = new ArrayList<>(list.size());
        List<Double> regCapList = new ArrayList<>(list.size());
        List<Double> industryCoList = new ArrayList<>(list.size());
        List<Double> regStateList = new ArrayList<>(list.size());
        for (EntCalInfo each : list) {
            estDateList.add(each.getEstDate());
            entTypeList.add(each.getEntType());
            regCapList.add(each.getRegCap());
            industryCoList.add(each.getIndustryCo());
            regStateList.add(each.getRegState());
        }

        estDateList = columnToOne(estDateList);
        entTypeList = columnToOne(entTypeList);
        regCapList = columnToOne(regCapList);
        industryCoList = columnToOne(industryCoList);
        regStateList = columnToOne(regStateList);

        for (int i = 0; i < list.size(); i++) {
            list.get(i)
                    .setEstDate(estDateList.get(i))
                    .setEntType(entTypeList.get(i))
                    .setRegCap(regCapList.get(i))
                    .setIndustryCo(industryCoList.get(i))
                    .setRegState(regStateList.get(i));
        }
        return list;
    }

    /**
     * Min-max normalizes one column: {@code (value - min) / (max - min)}.
     *
     * <p>When every value is identical the range is zero; each element then maps to
     * {@code 0.0} instead of the {@code NaN} that {@code 0 / 0} would produce.
     *
     * @param list column values
     * @return a new list of the same size holding the normalized values
     * @author lroyia
     * @since 2023-10-24 10:27:05
     */
    private static List<Double> columnToOne(List<Double> list) {
        // Double.MIN_VALUE is the smallest POSITIVE double, so it must not seed a maximum
        // search; the infinities are the correct identities for max and min.
        double max = Double.NEGATIVE_INFINITY;
        double min = Double.POSITIVE_INFINITY;
        for (double each : list) {
            max = Math.max(max, each);
            min = Math.min(min, each);
        }
        double range = max - min;
        List<Double> result = new ArrayList<>(list.size());
        for (double each : list) {
            result.add(range == 0D ? 0D : (each - min) / range);
        }
        return result;
    }

    /**
     * Manual smoke run: loads, converts and normalizes the test data, then prints each bean.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<EntCalInfo> one = toOne(getAllCalInfo());
        for (EntCalInfo each : one) {
            System.out.println(each);
        }
    }
}

View File

Can't render this file because it is too large.