其他公告静态数据提取
This commit is contained in:
parent
ecc81e5bf7
commit
a3c6b87338
|
|
@ -0,0 +1,286 @@
|
|||
package io.lroyia;
|
||||
|
||||
import io.lroyia.entity.NameList;
|
||||
import io.lroyia.entity.PenaltyNoticeInfoForm;
|
||||
import io.lroyia.util.StringUtils;
|
||||
import org.apache.commons.csv.CSVFormat;
|
||||
import org.apache.commons.csv.CSVPrinter;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileWriter;
|
||||
import java.io.InputStreamReader;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* ftl提取csv,thymeleaf,jsp等html模版亦适用
|
||||
* @author <a href="https://blog.lroyia.top">lroyia</a>
|
||||
* @since 2021/7/15 9:34
|
||||
**/
|
||||
public class Ftl2CsvOt1 {
|
||||
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
// 读列表
|
||||
Map<String, PenaltyNoticeInfoForm> list = readListFtl();
|
||||
// 读详情
|
||||
Map<String, PenaltyNoticeInfoForm> details = readDetailFtl();
|
||||
List<NameList> nameLists = new ArrayList<>();
|
||||
list.forEach((k,v) ->{
|
||||
PenaltyNoticeInfoForm each = details.get(k);
|
||||
v.setNoticeId(UUID.randomUUID().toString());
|
||||
if(each != null){
|
||||
String nameList = each.getNameList();
|
||||
v.setNameList(nameList);
|
||||
v.setContent(each.getContent());
|
||||
v.setNoticeFileName(each.getNoticeFileName());
|
||||
v.setNoticeFilePath(each.getNoticeFilePath());
|
||||
// 处理名单的html信息
|
||||
if(StringUtils.isNotBlank(nameList)){
|
||||
String type = each.getContent() != null && !each.getContent().contains("拟吊销") ? "吊销" : "拟吊销";
|
||||
Document doc = Jsoup.parse("<table>"+nameList+"</table>");
|
||||
Elements trs = doc.getElementsByTag("tr");
|
||||
Element header = trs.get(0);
|
||||
List<String> headerList = header.getElementsByTag("td").stream().map(h -> h.text().trim()).collect(Collectors.toList());
|
||||
for (int i = 1; i < trs.size(); i++) {
|
||||
NameList atom = new NameList();
|
||||
atom.setNameListType(type);
|
||||
atom.setItemId(UUID.randomUUID().toString());
|
||||
atom.setNoticeId(v.getNoticeId());
|
||||
nameLists.add(atom);
|
||||
Element tr = trs.get(i);
|
||||
Elements tds = tr.getElementsByTag("td");
|
||||
for (int j = 0; j < tds.size(); j++) {
|
||||
String content = tds.get(j).text().trim();
|
||||
String headerName = headerList.size() <= j ? StringUtils.EMPTY : headerList.get(j);
|
||||
if("序号".equals(headerName)) {
|
||||
continue;
|
||||
}else if(headerName.equals("企业名称") || "当事人".contains(headerName) || "名称".equals(headerName)){
|
||||
atom.setEntName(content);
|
||||
}else if(headerName.contains("注册号") || headerName.contains("统一")){
|
||||
atom.setRegNo(content);
|
||||
}else if(headerName.contains("法人") || headerName.contains("法定代表人") || headerName.contains("负责人")){
|
||||
atom.setLeRep(content);
|
||||
}else if(headerName.contains("地址") || headerName.equals("住所")){
|
||||
atom.setDom(content);
|
||||
}else if(headerName.equals("主体类型") || headerName.equals("企业类型")){
|
||||
atom.setEntType(content);
|
||||
}else if(headerName.contains("电话")){
|
||||
atom.setTel(content);
|
||||
}else if(headerName.equals("经营范围")){
|
||||
atom.setOpScope(content);
|
||||
}else if(headerName.contains("书号")){
|
||||
atom.setWritNo(content);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// 创建列表csv文件
|
||||
FileWriter writer = new FileWriter("F:/AFFICHE_BASE92.csv");
|
||||
String[] listHeader = new String[]{"NOTICEID", "NOTICETITLE", "NOTICENO", "NOTICECONTENT",
|
||||
"JUDAUTH", "JUDAUTH_CN", "JUDDATE", "NOTICEDATE", "NOTICETYPE", "S_EXT_FROMNODE", "S_EXT_DATATIME", "dstate"};
|
||||
CSVPrinter csvPrinter = new CSVPrinter(writer, CSVFormat.DEFAULT.withHeader(listHeader));
|
||||
String[] listHeader1 = new String[]{"BOOKID", "NOTICEID", "FILENAME", "REMARK",
|
||||
"S_EXT_FROMNODE", "S_EXT_DATATIME", "dstate"};
|
||||
FileWriter writer1 = new FileWriter("F:/AFFICHE_CASEINFO_NDOC92.csv");
|
||||
CSVPrinter csvPrinter1 = new CSVPrinter(writer1, CSVFormat.DEFAULT.withHeader(listHeader1));
|
||||
List<List<String>> result = new ArrayList<>();
|
||||
List<List<String>> result1 = new ArrayList<>();
|
||||
for (PenaltyNoticeInfoForm value : list.values()) {
|
||||
List<String> resultSet = new ArrayList<>();
|
||||
resultSet.add(value.getNoticeId());
|
||||
resultSet.add(value.getNoticeTitle());
|
||||
resultSet.add(StringUtils.EMPTY);
|
||||
resultSet.add(value.getContent());
|
||||
resultSet.add(StringUtils.EMPTY);
|
||||
resultSet.add(value.getDecOrgCn());
|
||||
String date = formatDate(value.getNoticeDate(), "yyyy-MM-dd HH:mm:ss");
|
||||
resultSet.add(date);
|
||||
resultSet.add(date);
|
||||
resultSet.add("92");
|
||||
resultSet.add(StringUtils.EMPTY);
|
||||
resultSet.add(date);
|
||||
resultSet.add("0");
|
||||
result.add(resultSet);
|
||||
if(StringUtils.isNotBlank(value.getNoticeFileName())){
|
||||
List<String> bookResultSet = new ArrayList<>();
|
||||
bookResultSet.add(UUID.randomUUID().toString());
|
||||
bookResultSet.add(value.getNoticeId());
|
||||
bookResultSet.add(value.getNoticeFileName());
|
||||
bookResultSet.add(StringUtils.EMPTY);
|
||||
bookResultSet.add(StringUtils.EMPTY);
|
||||
bookResultSet.add(date);
|
||||
bookResultSet.add("0");
|
||||
result1.add(bookResultSet);
|
||||
}
|
||||
}
|
||||
csvPrinter.printRecords(result);
|
||||
writer.close();
|
||||
csvPrinter1.printRecords(result1);
|
||||
writer1.close();
|
||||
|
||||
// 创建名单csv
|
||||
writer = new FileWriter("F:/AFFICHE_CASEINFO92.csv");
|
||||
listHeader = new String[]{"CASEID", "NOTICEID", "ENTNAME", "REGNO", "UNISCID", "LEREP",
|
||||
"ENTTYPE", "OPSCOPE", "TEL", "DOM", "PENDECNO", "PENDECNAME", "NAMELISTTYPE",
|
||||
"S_EXT_FROMNODE", "S_EXT_DATATIME"};
|
||||
csvPrinter = new CSVPrinter(writer, CSVFormat.DEFAULT.withHeader(listHeader));
|
||||
result = new ArrayList<>();
|
||||
for (NameList each : nameLists) {
|
||||
List<String> resultSet = new ArrayList<>();
|
||||
resultSet.add(each.getItemId());
|
||||
resultSet.add(each.getNoticeId());
|
||||
resultSet.add(each.getEntName());
|
||||
resultSet.add(each.getRegNo());
|
||||
resultSet.add(StringUtils.EMPTY);
|
||||
resultSet.add(each.getLeRep());
|
||||
resultSet.add(each.getEntType());
|
||||
resultSet.add(each.getOpScope());
|
||||
resultSet.add(each.getTel());
|
||||
resultSet.add(each.getDom());
|
||||
resultSet.add(each.getWritNo());
|
||||
resultSet.add(each.getWritNo());
|
||||
resultSet.add(each.getNameListType());
|
||||
resultSet.add(StringUtils.EMPTY);
|
||||
resultSet.add(StringUtils.EMPTY);
|
||||
result.add(resultSet);
|
||||
}
|
||||
csvPrinter.printRecords(result);
|
||||
writer.close();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 读取ftl列表页静态中的数据
|
||||
* @return 读取结果
|
||||
* @author lroyia
|
||||
* @since 2021年7月13日 20:40:32
|
||||
*/
|
||||
private static Map<String, PenaltyNoticeInfoForm> readListFtl() throws Exception {
|
||||
BufferedReader fileReader = new BufferedReader(new InputStreamReader(Ftl2CsvOt1.class.getClassLoader().getResourceAsStream("PenaltyHearing.ftl")));
|
||||
Document document = Jsoup.parse(fileReader.lines().collect(Collectors.joining()));
|
||||
fileReader.close();
|
||||
Elements trs = document.select("#list tr");
|
||||
Map<String, PenaltyNoticeInfoForm> result = new HashMap<>();
|
||||
for (Element tr : trs) {
|
||||
Elements tds = tr.getElementsByTag("td");
|
||||
Element td1 = tds.get(0);
|
||||
Element td2 = tds.get(1);
|
||||
Element td3 = tds.size() == 4 ? tds.get(3) : tds.get(2);
|
||||
String href = td1.getElementsByTag("a").get(0).attr("href");
|
||||
String id = href.substring(href.indexOf("=") + 1);
|
||||
String title = td1.text().trim();
|
||||
String decOrgCn = td2.text().trim();
|
||||
String dateStr = td3.text().trim();
|
||||
PenaltyNoticeInfoForm atom = new PenaltyNoticeInfoForm();
|
||||
atom.setNoticeId(id);
|
||||
atom.setNoticeTitle(title);
|
||||
atom.setDecOrgCn(decOrgCn);
|
||||
dateStr = dateStr.trim();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
/*
|
||||
不知道因为编码原因还是什么原因,调试时无法识别yyyy年MM月dd日这个表达式,
|
||||
同时也因为同时存在07月,7月这样的两种的月日长度,因而先使用正则处理
|
||||
*/
|
||||
Pattern compile = Pattern.compile("\\d+");
|
||||
Matcher matcher = compile.matcher(dateStr);
|
||||
while (matcher.find()){
|
||||
String group = matcher.group();
|
||||
if(group.length() == 1){
|
||||
sb.append("0");
|
||||
}
|
||||
sb.append(group);
|
||||
}
|
||||
if(StringUtils.isBlank(sb.toString())){
|
||||
System.out.println(dateStr);
|
||||
}
|
||||
atom.setNoticeDate(parseDate(sb.toString(), "yyyyMMdd"));
|
||||
|
||||
result.put(atom.getNoticeId(), atom);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* 读取ftl详情页静态中的数据
|
||||
* @return 读取结果
|
||||
* @author lroyia
|
||||
* @since 2021年7月13日 20:40:40
|
||||
*/
|
||||
private static Map<String, PenaltyNoticeInfoForm> readDetailFtl() throws Exception {
|
||||
// BufferedReader fileReader = new BufferedReader(new FileReader(DETAIL_FILE_PATH));
|
||||
BufferedReader fileReader = new BufferedReader(new InputStreamReader(Ftl2CsvOt1.class.getClassLoader().getResourceAsStream("PenaltyHearingDetail.ftl")));
|
||||
String replace = fileReader.lines().collect(Collectors.joining())
|
||||
.replace("<#elseif", "</elseif><elseif")
|
||||
.replace("<#if", "<elseif")
|
||||
.replace("</#if>", "</elseif>")
|
||||
.replace("id??&&id", "id")
|
||||
.replace("id==", "id=");
|
||||
fileReader.close();
|
||||
Document document = Jsoup.parse(replace);
|
||||
Map<String, PenaltyNoticeInfoForm> result = new HashMap<>();
|
||||
Elements elements = document.select("elseif[id]");
|
||||
for (Element each : elements) {
|
||||
Elements iframe = each.getElementsByTag("iframe");
|
||||
Elements content = each.select(".content");
|
||||
String id = each.attr("id");
|
||||
if(!content.isEmpty()){
|
||||
PenaltyNoticeInfoForm atom = new PenaltyNoticeInfoForm();
|
||||
atom.setContent(content.html().replace("\n", "").replace("\t", ""));
|
||||
atom.setNameList(each.select("table").html());
|
||||
result.put(id, atom);
|
||||
}else if(!iframe.isEmpty()){
|
||||
String src = iframe.attr("src");
|
||||
String path = src.substring(src.indexOf("=")+1);
|
||||
String fileName = path.substring(path.lastIndexOf("/")+1);
|
||||
PenaltyNoticeInfoForm atom = new PenaltyNoticeInfoForm();
|
||||
atom.setNoticeFilePath(path);
|
||||
atom.setNoticeFileName(fileName);
|
||||
result.put(id, atom);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* 日期格式化
|
||||
* @param dateStr 日期字串
|
||||
* @param format 转换格式
|
||||
* @return 转换结果
|
||||
* @throws Exception 异常
|
||||
* @author lroyia
|
||||
* @since 2021年7月13日 21:00:13
|
||||
*/
|
||||
private static Date parseDate(String dateStr, String format) throws Exception{
|
||||
SimpleDateFormat sdf = new SimpleDateFormat(format);
|
||||
return sdf.parse(dateStr);
|
||||
}
|
||||
|
||||
/**
|
||||
* 日期格式化为字符串
|
||||
* @param date 格式化日期
|
||||
* @param format 格式
|
||||
* @return 格式化结果
|
||||
* @throws Exception 异常
|
||||
* @author lroyia
|
||||
* @since 2021年7月15日 17:22:06
|
||||
*/
|
||||
private static String formatDate(Date date, String format) throws Exception{
|
||||
SimpleDateFormat sdf = new SimpleDateFormat(format);
|
||||
return sdf.format(date);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,286 @@
|
|||
package io.lroyia;
|
||||
|
||||
import io.lroyia.entity.NameList;
|
||||
import io.lroyia.entity.PenaltyNoticeInfoForm;
|
||||
import io.lroyia.util.StringUtils;
|
||||
import org.apache.commons.csv.CSVFormat;
|
||||
import org.apache.commons.csv.CSVPrinter;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileWriter;
|
||||
import java.io.InputStreamReader;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* ftl提取csv,thymeleaf,jsp等html模版亦适用
|
||||
* @author <a href="https://blog.lroyia.top">lroyia</a>
|
||||
* @since 2021/7/15 9:34
|
||||
**/
|
||||
public class Ftl2CsvOt2 {
|
||||
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
// 读列表
|
||||
Map<String, PenaltyNoticeInfoForm> list = readListFtl();
|
||||
// 读详情
|
||||
Map<String, PenaltyNoticeInfoForm> details = readDetailFtl();
|
||||
List<NameList> nameLists = new ArrayList<>();
|
||||
list.forEach((k,v) ->{
|
||||
PenaltyNoticeInfoForm each = details.get(k);
|
||||
v.setNoticeId(UUID.randomUUID().toString());
|
||||
if(each != null){
|
||||
String nameList = each.getNameList();
|
||||
v.setNameList(nameList);
|
||||
v.setContent(each.getContent());
|
||||
v.setNoticeFileName(each.getNoticeFileName());
|
||||
v.setNoticeFilePath(each.getNoticeFilePath());
|
||||
// 处理名单的html信息
|
||||
if(StringUtils.isNotBlank(nameList)){
|
||||
String type = each.getContent() != null && !each.getContent().contains("拟吊销") ? "吊销" : "拟吊销";
|
||||
Document doc = Jsoup.parse("<table>"+nameList+"</table>");
|
||||
Elements trs = doc.getElementsByTag("tr");
|
||||
Element header = trs.get(0);
|
||||
List<String> headerList = header.getElementsByTag("td").stream().map(h -> h.text().trim()).collect(Collectors.toList());
|
||||
for (int i = 1; i < trs.size(); i++) {
|
||||
NameList atom = new NameList();
|
||||
atom.setNameListType(type);
|
||||
atom.setItemId(UUID.randomUUID().toString());
|
||||
atom.setNoticeId(v.getNoticeId());
|
||||
nameLists.add(atom);
|
||||
Element tr = trs.get(i);
|
||||
Elements tds = tr.getElementsByTag("td");
|
||||
for (int j = 0; j < tds.size(); j++) {
|
||||
String content = tds.get(j).text().trim();
|
||||
String headerName = headerList.size() <= j ? StringUtils.EMPTY : headerList.get(j);
|
||||
if("序号".equals(headerName)) {
|
||||
continue;
|
||||
}else if(headerName.equals("企业名称") || "当事人".contains(headerName) || "名称".equals(headerName)){
|
||||
atom.setEntName(content);
|
||||
}else if(headerName.contains("注册号") || headerName.contains("统一")){
|
||||
atom.setRegNo(content);
|
||||
}else if(headerName.contains("法人") || headerName.contains("法定代表人") || headerName.contains("负责人")){
|
||||
atom.setLeRep(content);
|
||||
}else if(headerName.contains("地址") || headerName.equals("住所")){
|
||||
atom.setDom(content);
|
||||
}else if(headerName.equals("主体类型") || headerName.equals("企业类型")){
|
||||
atom.setEntType(content);
|
||||
}else if(headerName.contains("电话")){
|
||||
atom.setTel(content);
|
||||
}else if(headerName.equals("经营范围")){
|
||||
atom.setOpScope(content);
|
||||
}else if(headerName.contains("书号")){
|
||||
atom.setWritNo(content);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// 创建列表csv文件
|
||||
FileWriter writer = new FileWriter("F:/AFFICHE_BASE93.csv");
|
||||
String[] listHeader = new String[]{"NOTICEID", "NOTICETITLE", "NOTICENO", "NOTICECONTENT",
|
||||
"JUDAUTH", "JUDAUTH_CN", "JUDDATE", "NOTICEDATE", "NOTICETYPE", "S_EXT_FROMNODE", "S_EXT_DATATIME", "dstate"};
|
||||
CSVPrinter csvPrinter = new CSVPrinter(writer, CSVFormat.DEFAULT.withHeader(listHeader));
|
||||
String[] listHeader1 = new String[]{"BOOKID", "NOTICEID", "FILENAME", "REMARK",
|
||||
"S_EXT_FROMNODE", "S_EXT_DATATIME", "dstate"};
|
||||
FileWriter writer1 = new FileWriter("F:/AFFICHE_CASEINFO_NDOC93.csv");
|
||||
CSVPrinter csvPrinter1 = new CSVPrinter(writer1, CSVFormat.DEFAULT.withHeader(listHeader1));
|
||||
List<List<String>> result = new ArrayList<>();
|
||||
List<List<String>> result1 = new ArrayList<>();
|
||||
for (PenaltyNoticeInfoForm value : list.values()) {
|
||||
List<String> resultSet = new ArrayList<>();
|
||||
resultSet.add(value.getNoticeId());
|
||||
resultSet.add(value.getNoticeTitle());
|
||||
resultSet.add(StringUtils.EMPTY);
|
||||
resultSet.add(value.getContent());
|
||||
resultSet.add(StringUtils.EMPTY);
|
||||
resultSet.add(value.getDecOrgCn());
|
||||
String date = formatDate(value.getNoticeDate(), "yyyy-MM-dd HH:mm:ss");
|
||||
resultSet.add(date);
|
||||
resultSet.add(date);
|
||||
resultSet.add("93");
|
||||
resultSet.add(StringUtils.EMPTY);
|
||||
resultSet.add(date);
|
||||
resultSet.add("0");
|
||||
result.add(resultSet);
|
||||
if(StringUtils.isNotBlank(value.getNoticeFileName())){
|
||||
List<String> bookResultSet = new ArrayList<>();
|
||||
bookResultSet.add(UUID.randomUUID().toString());
|
||||
bookResultSet.add(value.getNoticeId());
|
||||
bookResultSet.add(value.getNoticeFileName());
|
||||
bookResultSet.add(StringUtils.EMPTY);
|
||||
bookResultSet.add(StringUtils.EMPTY);
|
||||
bookResultSet.add(date);
|
||||
bookResultSet.add("0");
|
||||
result1.add(bookResultSet);
|
||||
}
|
||||
}
|
||||
csvPrinter.printRecords(result);
|
||||
writer.close();
|
||||
csvPrinter1.printRecords(result1);
|
||||
writer1.close();
|
||||
|
||||
// 创建名单csv
|
||||
writer = new FileWriter("F:/AFFICHE_CASEINFO93.csv");
|
||||
listHeader = new String[]{"CASEID", "NOTICEID", "ENTNAME", "REGNO", "UNISCID", "LEREP",
|
||||
"ENTTYPE", "OPSCOPE", "TEL", "DOM", "PENDECNO", "PENDECNAME", "NAMELISTTYPE",
|
||||
"S_EXT_FROMNODE", "S_EXT_DATATIME"};
|
||||
csvPrinter = new CSVPrinter(writer, CSVFormat.DEFAULT.withHeader(listHeader));
|
||||
result = new ArrayList<>();
|
||||
for (NameList each : nameLists) {
|
||||
List<String> resultSet = new ArrayList<>();
|
||||
resultSet.add(each.getItemId());
|
||||
resultSet.add(each.getNoticeId());
|
||||
resultSet.add(each.getEntName());
|
||||
resultSet.add(each.getRegNo());
|
||||
resultSet.add(StringUtils.EMPTY);
|
||||
resultSet.add(each.getLeRep());
|
||||
resultSet.add(each.getEntType());
|
||||
resultSet.add(each.getOpScope());
|
||||
resultSet.add(each.getTel());
|
||||
resultSet.add(each.getDom());
|
||||
resultSet.add(each.getWritNo());
|
||||
resultSet.add(each.getWritNo());
|
||||
resultSet.add(each.getNameListType());
|
||||
resultSet.add(StringUtils.EMPTY);
|
||||
resultSet.add(StringUtils.EMPTY);
|
||||
result.add(resultSet);
|
||||
}
|
||||
csvPrinter.printRecords(result);
|
||||
writer.close();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 读取ftl列表页静态中的数据
|
||||
* @return 读取结果
|
||||
* @author lroyia
|
||||
* @since 2021年7月13日 20:40:32
|
||||
*/
|
||||
private static Map<String, PenaltyNoticeInfoForm> readListFtl() throws Exception {
|
||||
BufferedReader fileReader = new BufferedReader(new InputStreamReader(Ftl2CsvOt2.class.getClassLoader().getResourceAsStream("AbnormalHearing.ftl")));
|
||||
Document document = Jsoup.parse(fileReader.lines().collect(Collectors.joining()));
|
||||
fileReader.close();
|
||||
Elements trs = document.select("#list tr");
|
||||
Map<String, PenaltyNoticeInfoForm> result = new HashMap<>();
|
||||
for (Element tr : trs) {
|
||||
Elements tds = tr.getElementsByTag("td");
|
||||
Element td1 = tds.get(0);
|
||||
Element td2 = tds.get(1);
|
||||
Element td3 = tds.size() == 4 ? tds.get(3) : tds.get(2);
|
||||
String href = td1.getElementsByTag("a").get(0).attr("href");
|
||||
String id = href.substring(href.indexOf("=") + 1);
|
||||
String title = td1.text().trim();
|
||||
String decOrgCn = td2.text().trim();
|
||||
String dateStr = td3.text().trim();
|
||||
PenaltyNoticeInfoForm atom = new PenaltyNoticeInfoForm();
|
||||
atom.setNoticeId(id);
|
||||
atom.setNoticeTitle(title);
|
||||
atom.setDecOrgCn(decOrgCn);
|
||||
dateStr = dateStr.trim();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
/*
|
||||
不知道因为编码原因还是什么原因,调试时无法识别yyyy年MM月dd日这个表达式,
|
||||
同时也因为同时存在07月,7月这样的两种的月日长度,因而先使用正则处理
|
||||
*/
|
||||
Pattern compile = Pattern.compile("\\d+");
|
||||
Matcher matcher = compile.matcher(dateStr);
|
||||
while (matcher.find()){
|
||||
String group = matcher.group();
|
||||
if(group.length() == 1){
|
||||
sb.append("0");
|
||||
}
|
||||
sb.append(group);
|
||||
}
|
||||
if(StringUtils.isBlank(sb.toString())){
|
||||
System.out.println(dateStr);
|
||||
}
|
||||
atom.setNoticeDate(parseDate(sb.toString(), "yyyyMMdd"));
|
||||
|
||||
result.put(atom.getNoticeId(), atom);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* 读取ftl详情页静态中的数据
|
||||
* @return 读取结果
|
||||
* @author lroyia
|
||||
* @since 2021年7月13日 20:40:40
|
||||
*/
|
||||
private static Map<String, PenaltyNoticeInfoForm> readDetailFtl() throws Exception {
|
||||
// BufferedReader fileReader = new BufferedReader(new FileReader(DETAIL_FILE_PATH));
|
||||
BufferedReader fileReader = new BufferedReader(new InputStreamReader(Ftl2CsvOt2.class.getClassLoader().getResourceAsStream("AbnormalHearingDetail.ftl")));
|
||||
String replace = fileReader.lines().collect(Collectors.joining())
|
||||
.replace("<#elseif", "</elseif><elseif")
|
||||
.replace("<#if", "<elseif")
|
||||
.replace("</#if>", "</elseif>")
|
||||
.replace("id??&&id", "id")
|
||||
.replace("id==", "id=");
|
||||
fileReader.close();
|
||||
Document document = Jsoup.parse(replace);
|
||||
Map<String, PenaltyNoticeInfoForm> result = new HashMap<>();
|
||||
Elements elements = document.select("elseif[id]");
|
||||
for (Element each : elements) {
|
||||
Elements iframe = each.getElementsByTag("iframe");
|
||||
Elements content = each.select(".content");
|
||||
String id = each.attr("id");
|
||||
if(!content.isEmpty()){
|
||||
PenaltyNoticeInfoForm atom = new PenaltyNoticeInfoForm();
|
||||
atom.setContent(content.html().replace("\n", "").replace("\t", ""));
|
||||
atom.setNameList(each.select("table").html());
|
||||
result.put(id, atom);
|
||||
}else if(!iframe.isEmpty()){
|
||||
String src = iframe.attr("src");
|
||||
String path = src.substring(src.indexOf("=")+1);
|
||||
String fileName = path.substring(path.lastIndexOf("/")+1);
|
||||
PenaltyNoticeInfoForm atom = new PenaltyNoticeInfoForm();
|
||||
atom.setNoticeFilePath(path);
|
||||
atom.setNoticeFileName(fileName);
|
||||
result.put(id, atom);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* 日期格式化
|
||||
* @param dateStr 日期字串
|
||||
* @param format 转换格式
|
||||
* @return 转换结果
|
||||
* @throws Exception 异常
|
||||
* @author lroyia
|
||||
* @since 2021年7月13日 21:00:13
|
||||
*/
|
||||
private static Date parseDate(String dateStr, String format) throws Exception{
|
||||
SimpleDateFormat sdf = new SimpleDateFormat(format);
|
||||
return sdf.parse(dateStr);
|
||||
}
|
||||
|
||||
/**
|
||||
* 日期格式化为字符串
|
||||
* @param date 格式化日期
|
||||
* @param format 格式
|
||||
* @return 格式化结果
|
||||
* @throws Exception 异常
|
||||
* @author lroyia
|
||||
* @since 2021年7月15日 17:22:06
|
||||
*/
|
||||
private static String formatDate(Date date, String format) throws Exception{
|
||||
SimpleDateFormat sdf = new SimpleDateFormat(format);
|
||||
return sdf.format(date);
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue