4000-520-616
欢迎来到免疫在线!(蚂蚁淘生物旗下平台)  请登录 |  免费注册 |  询价篮
主营:原厂直采,平行进口,授权代理(蚂蚁淘为您服务)
咨询热线电话
4000-520-616
当前位置: 首页 > 新闻动态 >
热卖商品
新闻详情
cnki数据抓取及结构化存储_忙着。。。的博客-CSDN博客
来自 : CSDN技术社区 发布时间:2021-03-24

启动类
根据分项一级级抓取

package com.rhhz;

import java.io.File;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import javax.persistence.EntityManager;
import javax.persistence.EntityManagerFactory;
import javax.persistence.Query;
import javax.servlet.ServletContextEvent;
import javax.servlet.ServletContextListener;
import javax.servlet.annotation.WebListener;

import org.springframework.web.context.WebApplicationContext;
import org.springframework.web.context.support.WebApplicationContextUtils;

import com.rhhz.core.element.metadata.bean.Article;
import com.rhhz.core.element.metadata.bean.Journal;
import com.rhhz.core.element.metadata.bean.JournalCatalog;
import com.rhhz.datatransfer.cnki.CnkiArticleMetaByJournalSipder;

/**
 * Servlet context listener that starts the CNKI crawl when the web application starts.
 *
 * <p>The crawl is organised as manual stages; the operator enables one stage at a time in
 * {@link #contextInitialized(ServletContextEvent)}:
 * <ol>
 *   <li>collect the article links of every issue into {@code <journalPath>/<issueId>.txt};</li>
 *   <li>create one {@link JournalCatalog} row per issue file;</li>
 *   <li>crawl article metadata pages, then article HTML pages (the stage enabled here);</li>
 *   <li>rename / move the PDF files.</li>
 * </ol>
 *
 * <p>NOTE(review): this listing was recovered from a copy in which {@code = + " ' < > & @}
 * had been stripped. Operators and literals that could not be recovered with certainty are
 * marked with {@code NOTE(review)} and must be confirmed against the real sources.
 */
@WebListener
public class SystemInitListenerCnki implements ServletContextListener {

    private static WebApplicationContext webApplicationContext;

    @Override
    public void contextDestroyed(ServletContextEvent sce) {
        // Nothing to release: the worker pools are shut down right after their tasks are queued.
    }

    /**
     * Looks up the {@code entityManagerFactory} bean, creates an {@link EntityManager} and
     * runs the currently enabled crawl stage (article HTML pages).
     *
     * @param event the servlet context event supplied by the container
     * @throws IllegalStateException if no Spring web application context is registered
     */
    @Override
    public void contextInitialized(ServletContextEvent event) {
        webApplicationContext =
                WebApplicationContextUtils.getWebApplicationContext(event.getServletContext());
        if (webApplicationContext == null) {
            throw new IllegalStateException(
                    "No Spring WebApplicationContext found; cannot start the CNKI crawl");
        }
        EntityManagerFactory entityManagerFactory =
                (EntityManagerFactory) webApplicationContext.getBean("entityManagerFactory");
        // The entity manager is handed to asynchronous tasks, so it cannot be closed here.
        EntityManager entityManager = entityManagerFactory.createEntityManager();

        String journalURL = "https://navi.cnki.net/knavi/JournalDetail?pcode=CJFD&pykm=SPKJ";
        String journalPath = "D:\\webDriver\\SPKJ";

        // Stage 1: collect the article links of every issue.
        // new CnkiArticleMetaByJournalSipder().spiderArticleLinks(journalURL, journalPath, 2017, 24, "", 0);

        // Stage 2: create the journal catalog rows.
        // createJournalCatalog(entityManager, journalPath, journalURL);

        // Stage 3.1: crawl article metadata pages (the original additionally restricted the
        // catalog query to rows whose remark is NULL).
        // handleMetaArticle(entityManager, journalPath, journalURL, findCatalogs(entityManager));

        // Stage 3.2: crawl article HTML pages.
        List<JournalCatalog> catalogList = findCatalogs(entityManager);
        handleHtmlArticle(entityManager, journalPath, journalURL, catalogList);

        // Stage 4: PDF handling. handleCustomPDF expects a native SQL pattern that selects one
        // shell command ("ren ..." or "move ...") per row of meta_article.
        // handleCnkiPDF(entityManager, pdfPath);
        // handleCustomPDF(entityManager, pdfPath, sqlPattern);

        // Printed as soon as the tasks are queued, not when they have finished.
        System.out.println("程序执行完毕");
    }

    /**
     * Creates or updates one {@link JournalCatalog} per issue file found in {@code journalPath}.
     *
     * <p>Issue files are those whose name starts with {@code yq}. The catalog id is the file
     * name without {@code .txt}; the digits of the file name give the year (first four digits)
     * and the issue (remaining digits). Files with fewer than four digits are skipped.
     *
     * @param entityManager entity manager used to load the journal and merge the catalogs
     * @param journalPath   directory that holds the issue link files
     * @param journalURL    journal URL; its {@code pykm} query parameter is used in the cover path
     */
    public void createJournalCatalog(EntityManager entityManager, String journalPath, String journalURL) {
        FileFilter issueFileFilter = pathname -> pathname.getName().startsWith("yq");
        File[] issueFiles = new File(journalPath).listFiles(issueFileFilter);
        if (issueFiles == null) {
            // listFiles returns null when the path is not a readable directory.
            System.err.println("Journal directory cannot be listed: " + journalPath);
            return;
        }

        String journalPublisherId = extractJournalCode(journalURL);
        Journal journal = entityManager
                .createQuery("FROM " + Journal.class.getName(), Journal.class)
                .getSingleResult();

        List<JournalCatalog> catalogList = new ArrayList<>(issueFiles.length);
        for (File issueFile : issueFiles) {
            String fileName = issueFile.getName();
            String digits = fileName.replaceAll("\\D", "");
            if (digits.length() < 4) {
                System.err.println("Skipping issue file without a 4-digit year: " + fileName);
                continue;
            }
            JournalCatalog catalog = new JournalCatalog();
            catalog.setId(fileName.replace(".txt", ""));
            catalog.setYear(digits.substring(0, 4));
            catalog.setIssue(digits.substring(4));
            catalog.setState("0");
            catalog.setReleaseState(1);
            catalog.setJournalId(journal.getId());
            if (!journalPublisherId.isEmpty()) {
                catalog.setCoverImgSrc("journal/img/cover/" + journalPublisherId + digits + ".png");
            }
            catalogList.add(catalog);
        }

        try {
            entityManager.getTransaction().begin();
            for (JournalCatalog journalCatalog : catalogList) {
                entityManager.merge(journalCatalog);
            }
            entityManager.getTransaction().commit();
        } catch (Exception e) {
            rollbackIfActive(entityManager);
            System.err.println("Saving journal catalogs failed: " + e);
        }
    }

    /**
     * Refreshes the abstracts of stored articles using a pool of four worker threads.
     *
     * <p>The per-catalog metadata crawl that used to run first is disabled in the original
     * source, so {@code journalPath}, {@code journalURL} and {@code catalogList} are currently
     * unused. Each task calls {@code updateAbstrat} and then sleeps one second as a throttle.
     *
     * <p>NOTE(review): the single {@code entityManager} is shared by four threads; JPA entity
     * managers are not thread-safe. Confirm how {@code updateAbstrat} uses it.
     *
     * @param entityManager entity manager used for the query and passed to every task
     * @param journalPath   unused by the active code
     * @param journalURL    unused by the active code
     * @param catalogList   unused by the active code
     */
    public void handleMetaArticle(EntityManager entityManager, String journalPath, String journalURL,
            List<JournalCatalog> catalogList) {
        try {
            // NOTE(review): the two LIKE patterns and the year comparison lost characters in
            // extraction; '%<%' / '%>%' (abstracts without markup) and '=' are assumptions.
            String sql = "SELECT o FROM " + Article.class.getName() + " o"
                    + " WHERE abstractinfo not like '%<%' and abstractinfo not like '%>%'"
                    + " and article_year = 2016 and article_remark1 is null";
            List<Article> articles = entityManager.createQuery(sql, Article.class).getResultList();

            ExecutorService executorService = Executors.newFixedThreadPool(4);
            try {
                for (Article article : articles) {
                    executorService.execute(() -> {
                        new CnkiArticleMetaByJournalSipder().updateAbstrat(entityManager, article);
                        try {
                            Thread.sleep(1000);
                        } catch (InterruptedException e) {
                            Thread.currentThread().interrupt();
                        }
                    });
                }
            } finally {
                // Queued tasks still run; the pool threads end once the queue is drained.
                executorService.shutdown();
            }
        } catch (Exception e) {
            System.err.println("Abstract refresh could not be started: " + e);
        }
    }

    /**
     * Crawls the CNKI HTML page data of every catalog, one catalog at a time on a single
     * worker thread.
     *
     * @param entityManager entity manager passed to the spider
     * @param journalPath   directory that holds the issue link files
     * @param journalURL    journal URL passed to the spider
     * @param catalogList   catalogs (issues) to crawl
     */
    public void handleHtmlArticle(EntityManager entityManager, String journalPath, String journalURL,
            List<JournalCatalog> catalogList) {
        ExecutorService executorService = Executors.newFixedThreadPool(1);
        try {
            for (JournalCatalog catalog : catalogList) {
                executorService.execute(() -> new CnkiArticleMetaByJournalSipder()
                        .spiderArticleHTML(entityManager, journalPath, journalURL, catalog));
            }
        } finally {
            // Queued tasks still run; without this the worker thread would never terminate.
            executorService.shutdown();
        }
    }

    /**
     * Placeholder for the CNKI PDF stage; the original has no implementation.
     *
     * @param entityManager unused
     * @param journalPath   unused
     */
    public void handleCnkiPDF(EntityManager entityManager, String journalPath) {
    }

    /**
     * Runs the shell command produced by {@code sqlPattern} for every article of every catalog
     * that is not yet marked with {@code PDF}, then appends {@code -PDF} to the catalog remark.
     *
     * <p>The target folder {@code <filePath>/<year>/<issue>/PDF} is created first.
     *
     * <p>NOTE(review): every selected row is executed through {@code cmd.exe /c}; the commands
     * are started but not awaited. {@code sqlPattern} and the article data must be trusted.
     *
     * @param entityManager entity manager used for the queries and the remark update
     * @param filePath      root directory of the PDF files
     * @param sqlPattern    native SQL without a WHERE clause that selects one command per article
     */
    public void handleCustomPDF(EntityManager entityManager, String filePath, String sqlPattern) {
        List<JournalCatalog> catalogList = findCatalogs(entityManager);
        if (catalogList == null) {
            return;
        }
        Runtime runtime = Runtime.getRuntime();
        for (JournalCatalog catalog : catalogList) {
            // A catalog that was never remarked has a null remark; treat it as empty.
            String remark = catalog.getRemark() == null ? "" : catalog.getRemark();
            if (remark.contains("PDF")) {
                continue;
            }
            System.out.print("开始处理" + catalog.getYear() + "_" + catalog.getIssue());

            File issueFolder = new File(filePath + File.separator + catalog.getYear()
                    + File.separator + catalog.getIssue() + File.separator + "PDF");
            if (!issueFolder.exists()) {
                issueFolder.mkdirs();
            }

            // Bind year and issue instead of concatenating them into the SQL text.
            Query query = entityManager.createNativeQuery(
                    sqlPattern + " where article_year = ?1 and article_issue = ?2");
            query.setParameter(1, catalog.getYear());
            query.setParameter(2, catalog.getIssue());
            @SuppressWarnings("unchecked") // the pattern selects a single string column
            List<String> commands = query.getResultList();

            for (String cmd : commands) {
                try {
                    runtime.exec("cmd.exe /c " + cmd);
                } catch (Exception e) {
                    // Print the failed command so it can be re-run by hand.
                    System.out.println(cmd);
                }
            }

            try {
                catalog.setRemark(remark + "-PDF");
                entityManager.getTransaction().begin();
                entityManager.merge(catalog);
                entityManager.getTransaction().commit();
                System.out.print("处理完成");
            } catch (Exception e) {
                rollbackIfActive(entityManager);
                System.err.println("Marking catalog " + catalog.getId() + " failed: " + e);
            }
            System.out.println("");
        }
    }

    /**
     * Loads the catalogs of the year being processed, newest issue first.
     * NOTE(review): the comparison operator was lost in extraction; {@code =} is assumed.
     */
    private static List<JournalCatalog> findCatalogs(EntityManager entityManager) {
        String jpql = "FROM " + JournalCatalog.class.getName()
                + " where year = 2018 ORDER BY year DESC,issue DESC";
        return entityManager.createQuery(jpql, JournalCatalog.class).getResultList();
    }

    /** Returns the value of the {@code pykm} query parameter, or an empty string if absent. */
    private static String extractJournalCode(String journalURL) {
        // e.g. https://navi.cnki.net/knavi/JournalDetail?pcode=CJFD&pykm=LCGD
        String key = "pykm=";
        int start = journalURL.indexOf(key);
        if (start == -1) {
            return "";
        }
        String code = journalURL.substring(start + key.length());
        int end = code.indexOf('&');
        return end == -1 ? code : code.substring(0, end);
    }

    /** Rolls back the current transaction only if one is active. */
    private static void rollbackIfActive(EntityManager entityManager) {
        if (entityManager.getTransaction().isActive()) {
            entityManager.getTransaction().rollback();
        }
    }
}

抓取实现

package com.rhhz.datatransfer.cnki;import java.io.File;import java.io.FileFilter;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.text.SimpleDateFormat;import java.util.ArrayList;import java.util.HashMap;import java.util.HashSet;import java.util.List;import java.util.Map;import java.util.Map.Entry;import java.util.Scanner;import java.util.Set;import java.util.concurrent.TimeUnit;import java.util.regex.Matcher;import java.util.regex.Pattern;import javax.persistence.EntityManager;import javax.persistence.Query;import org.apache.commons.io.FileUtils;import org.apache.http.HttpEntity;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.client.methods.HttpPost;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.HttpClients;import org.apache.http.util.EntityUtils;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import org.openqa.selenium.By;import org.openqa.selenium.OutputType;import org.openqa.selenium.TakesScreenshot;import org.openqa.selenium.WebDriver;import org.openqa.selenium.WebElement;import org.openqa.selenium.chrome.ChromeDriver;import org.openqa.selenium.chrome.ChromeDriverService;import org.openqa.selenium.chrome.ChromeOptions;import org.openqa.selenium.edge.EdgeDriver;import org.openqa.selenium.support.ui.ExpectedConditions;import org.openqa.selenium.support.ui.WebDriverWait;import com.rhhz.core.element.metadata.bean.Article;import com.rhhz.core.element.metadata.bean.ArticleAffiliation;import com.rhhz.core.element.metadata.bean.ArticleAuthor;import com.rhhz.core.element.metadata.bean.ArticleBusiness;import com.rhhz.core.element.metadata.bean.ArticleFundPrj;import com.rhhz.core.element.metadata.bean.ArticleKeyword;import com.rhhz.core.element.metadata.bean.ArticleReference;import com.rhhz.core.element.metadata.bean.Journal;import 
com.rhhz.core.element.metadata.bean.JournalCatalog;public class CnkiArticleMetaByJournalSipder{ public static String userAgent Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36 ; public static void main(String[] args) { //getJournalURL(); new CnkiArticleMetaByJournalSipder().downloadPDF( https://navi.cnki.net/knavi/JournalDetail?pcode CJFD pykm DWXY , ZR , 2002 , 1 ); } public void runtime(String cmd) { try { Runtime runtime Runtime.getRuntime(); runtime.exec(cmd); runtime.exit(0); } catch (Exception e) { e.printStackTrace(); } } /** * 【1】知网抓取目次下链接保存到 filePath * param journalURL 必填参数 * param filePath 必填参数 * param skipYear 辅助参数 * param skipIssue 辅助参数 * param pageIndex 辅助参数 * param count 辅助参数 */ public void spiderArticleLinks(String journalURL,String filePath,int skipYear,int skipIssue,String pageIndex,int count) { File file new File(filePath); File log new File(filePath File.separator log.txt ); FileOutputStream out null; FileOutputStream out2 null; try { if(!file.exists()) file.mkdirs(); if(!log.exists()) log.createNewFile(); } catch (Exception e) { System.out.println( 文件创建失败!!! 
); return; } List String errorIssue new ArrayList String (); List String articleLinksByIssue new ArrayList String (); try { WebDriver driver null; ChromeDriverService service null; String chromeDriverPath D:\\\\webDriver\\\\chromedriver.exe ; HashMap String, Object chromePrefs new HashMap String, Object (); ChromeOptions chromeOptions new ChromeOptions(); try { chromePrefs.put( download.default_directory , D:\\\\webDriver ); File chromeDriverFile new File(chromeDriverPath); System.setProperty( webdriver.chrome.driver ,chromeDriverPath); chromeOptions.setExperimentalOption( prefs , chromePrefs); //设置为 headless 模式 必须 //chromeOptions.addArguments( --headless chromeOptions.addArguments( --disable-gpu ); chromeOptions.addArguments( --no-sandbox ); // 禁止弹出拦截 chromeOptions.addArguments( --disable-popup-blocking ); // 禁止默认浏览器检查 chromeOptions.addArguments( no-default-browser-check ); chromeOptions.addArguments( about:histograms ); chromeOptions.addArguments( about:cache ); chromeOptions.addArguments( --start-maximized ); //创建一个 ChromeDriver 接口 service new ChromeDriverService.Builder().usingDriverExecutable(chromeDriverFile).usingAnyFreePort().build(); service.start(); } catch (IOException e1) { e1.printStackTrace(); } driver new ChromeDriver(chromeOptions); driver.get(journalURL); try { Thread.sleep(3000); } catch (InterruptedException e) { e.printStackTrace(); } List WebElement pageEles null; try { pageEles new WebDriverWait(driver, 10).until(ExpectedConditions.presenceOfAllElementsLocatedBy(By.cssSelector( .page-list a ))); } catch (Exception e) { } if(pageEles ! 
null !pageEles.isEmpty()) { Pattern pattern Pattern.compile( [0-9] ); List String pageList new ArrayList String (); for (int i 0; i pageEles.size(); i ) { String pageStr pageEles.get(i).getText().trim(); if( .equals(pageStr)) pageStr pageEles.get(i).getAttribute( innerHTML ).trim(); Matcher matcher pattern.matcher(pageStr); if(matcher.find()) { pageList.add(pageStr); } } for (int i 0; i pageList.size(); i ) { try { pageEles new WebDriverWait(driver, 10).until(ExpectedConditions.presenceOfAllElementsLocatedBy(By.cssSelector( .page-list a ))); } catch (Exception e) { } String pageStr pageList.get(i); //翻页跳转验证页 重新加载 if(! .equals(pageIndex) Integer.valueOf(pageStr) Integer.valueOf(pageIndex)) continue; for (WebElement pageEle : pageEles) { String page pageEle.getText().trim(); if( .equals(page)) page pageEle.getAttribute( innerHTML ).trim(); if(pageList.get(i).equals(page)) { pageEle.click(); break; } } //跳转验证界面 递归 if(driver.getCurrentUrl().contains( https://navi.cnki.net/knavi/Home/Validate?returnUrl )) { count ; System.out.println( 跳转验证界面退出 重新加载 count); driver.manage().deleteAllCookies(); driver.quit(); spiderArticleLinks(journalURL,filePath,9999,99,pageStr,count); return; } try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } boolean flag false; WebElement yearIssueEle driver.findElement(By.id( yearissue i)); List WebElement yearEles yearIssueEle.findElements(By.tagName( dl )); for (int j 0; j yearEles.size(); j ) { WebElement yearEle yearEles.get(j).findElement(By.tagName( dt )); String year yearEle.getAttribute( innerText ).trim(); if(Integer.valueOf(year) skipYear) continue; yearEle.click(); //年 try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } driver.manage().timeouts().implicitlyWait(5, TimeUnit.SECONDS); WebElement issEle yearEles.get(j).findElement(By.tagName( dd ));// String isShow issEle.getCssValue( display // if( none .equals(isShow)) continue; List WebElement issueEles 
issEle.findElements(By.tagName( a )); for (int k 0; k issueEles.size(); k ) { try { WebElement issueEle issueEles.get(k); String issue issueEle.getAttribute( innerHTML ).replace( No. , ).trim(); if(Integer.valueOf(year) skipYear Integer.valueOf(issue) skipIssue) continue; String issueId issueEle.getAttribute( id ); File issueFile new File(filePath File.separator issueId .txt ); if(issueFile.exists() issueFile.length() 0) continue; if(!issueFile.exists()) issueFile.createNewFile(); issueEle.click(); //期 try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } //跳转验证界面 递归 if(driver.getCurrentUrl().contains( https://navi.cnki.net/knavi/Home/Validate?returnUrl )) { count ; System.out.println( 跳转验证界面退出 重新加载 count); driver.manage().deleteAllCookies(); driver.quit(); spiderArticleLinks(journalURL,filePath,Integer.valueOf(year),Integer.valueOf(issue),pageStr,count); return; } out new FileOutputStream(issueFile); WebElement catalogContentEle driver.findElement(By.id( CataLogContent )); List WebElement articleByCatalogEles catalogContentEle.findElements(By.cssSelector( dd .name a )); for (WebElement articleEle : articleByCatalogEles) { String href articleEle.getAttribute( href ); String parameter href.substring(href.indexOf( ? 
)); //fileName String articleUrl https://kns.cnki.net/kcms/detail/detail.aspx parameter \\r\\n ; out.write(articleUrl.getBytes()); //String artileTitle articleEle.getText(); //String articleHTMLUrl https://kns.cnki.net/KXReader/Detail parameter; //articleLinkList.add(articleUrl); } out2 new FileOutputStream(log,true); String logStr year _ issue : articleByCatalogEles.size() \\r\\n ; out2.write(logStr.getBytes()); } catch (Exception e) { e.printStackTrace(); count ; System.out.println( 文章链接获取失败 【 count 】 ); }finally { try { out.close(); out2.close(); } catch (Exception e2) { } } } } } try { driver.close(); } catch (Exception e) { } }else { WebElement yearIssueEle driver.findElement(By.id( yearissue 0 )); List WebElement yearEles yearIssueEle.findElements(By.tagName( dl )); for (int j 0; j yearEles.size(); j ) { WebElement yearEle yearEles.get(j).findElement(By.tagName( dt )); String year yearEle.getAttribute( innerText ).trim(); if(Integer.valueOf(year) skipYear) continue; yearEle.click(); //年 try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } WebElement issEle yearEles.get(j).findElement(By.tagName( dd ));// String isShow issEle.getCssValue( display // if( none .equals(isShow)) continue; List WebElement issueEles issEle.findElements(By.tagName( a )); for (int k 0; k issueEles.size(); k ) { try { WebElement issueEle issueEles.get(k); String issue issueEle.getAttribute( innerHTML ).replace( No. 
, ).trim(); if(Integer.valueOf(year) skipYear Integer.valueOf(issue) skipIssue) continue; String issueId issueEle.getAttribute( id ); issueEle.click(); //期 try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } //跳转验证界面 递归 if(driver.getCurrentUrl().contains( https://navi.cnki.net/knavi/Home/Validate?returnUrl )) { count ; System.out.println( 跳转验证界面退出 重新加载 count); driver.manage().deleteAllCookies(); driver.quit(); spiderArticleLinks(journalURL,filePath,Integer.valueOf(year),Integer.valueOf(issue), ,count); return; } File issueFile new File(filePath File.separator issueId .txt ); if(issueFile.exists() issueFile.length() 0) continue; if(!issueFile.exists()) issueFile.createNewFile(); out new FileOutputStream(issueFile); WebElement catalogContentEle driver.findElement(By.id( CataLogContent )); List WebElement articleByCatalogEles catalogContentEle.findElements(By.cssSelector( dd .name a )); for (WebElement articleEle : articleByCatalogEles) { String href articleEle.getAttribute( href ); String parameter href.substring(href.indexOf( ? 
)); //fileName String articleUrl https://kns.cnki.net/kcms/detail/detail.aspx parameter \\r\\n ; out.write(articleUrl.getBytes()); //String artileTitle articleEle.getText(); //String articleHTMLUrl https://kns.cnki.net/KXReader/Detail parameter; //articleLinkList.add(articleUrl); } out2 new FileOutputStream(log,true); String logStr year _ issue : articleByCatalogEles.size() \\r\\n ; out2.write(logStr.getBytes()); } catch (Exception e) { e.printStackTrace(); count ; System.out.println( 文章链接获取失败 【 count 】 ); }finally { try { out.close(); out2.close(); } catch (Exception e2) { } } } } } } catch (Exception e) { e.printStackTrace(); } for (String string : articleLinksByIssue) { System.out.println(string); } } /** * 【2】知网元数据页面抓取 * param entityManager * param journalPath * param journalURL * param catalog * return */ public Map String,Object spiderMetaArticles(EntityManager entityManager,String journalPath,String journalURL,JournalCatalog catalog) { Set String publisherIdSet new HashSet String (); //查询目次catalog、journal对象 //JournalCatalog catalog (JournalCatalog) entityManager.createQuery( FROM JournalCatalog.class.getName() where year year and issue issue ).getSingleResult(); Journal journal (Journal) entityManager.createQuery( FROM Journal.class.getName()).getSingleResult(); System.out.println( catalog.getYear() _ catalog.getIssue() ); String language cn ; Map String,Object dataMap new HashMap String,Object (); List Article articleList new ArrayList Article (); List ArticleBusiness businessList new ArrayList ArticleBusiness (); WebDriver driver null; ChromeDriverService service null; String chromeDriverPath D:\\\\webDriver\\\\chromedriver.exe ; HashMap String, Object chromePrefs new HashMap String, Object (); ChromeOptions chromeOptions new ChromeOptions(); try { chromePrefs.put( download.default_directory , D:\\\\webDriver ); File chromeDriverFile new File(chromeDriverPath); System.setProperty( webdriver.chrome.driver ,chromeDriverPath); 
chromeOptions.setExperimentalOption( prefs , chromePrefs); //设置为 headless 模式 必须 chromeOptions.addArguments( --headless ); chromeOptions.addArguments( --disable-gpu ); chromeOptions.addArguments( --no-sandbox ); // 禁止弹出拦截 chromeOptions.addArguments( --disable-popup-blocking ); // 禁止默认浏览器检查 chromeOptions.addArguments( no-default-browser-check ); chromeOptions.addArguments( about:histograms ); chromeOptions.addArguments( about:cache ); chromeOptions.addArguments( --start-maximized ); //创建一个 ChromeDriver 接口 service new ChromeDriverService.Builder().usingDriverExecutable(chromeDriverFile).usingAnyFreePort().build(); service.start(); } catch (IOException e1) { e1.printStackTrace(); } List String articleLinkByYearIssue new ArrayList String (); File issueFile new File(journalPath File.separator catalog.getId() .txt ); if(issueFile.exists()) { try { articleLinkByYearIssue FileUtils.readLines(issueFile); } catch (Exception e) { e.printStackTrace(); } }else{ driver new ChromeDriver(chromeOptions); driver.get(journalURL); try { Thread.sleep(3000); } catch (InterruptedException e) { e.printStackTrace(); } articleLinkByYearIssue parseArticleHTMLByYearIssue(driver,catalog); } driver new ChromeDriver(chromeOptions); Map String, Article articleMap new HashMap String, Article (); for (int k 0; k articleLinkByYearIssue.size(); k ) { //测试直接抓取最后一条 //if(k ! 
articleLinkByYearIssue.size()-1) continue; try { if(driver.toString().contains( null )) driver new ChromeDriver(chromeOptions); driver.get(articleLinkByYearIssue.get(k)); String articleTitle driver.getTitle(); String articleHandle1 driver.getWindowHandle(); Article metaArticle sipderArticleMeta(driver, journal, catalog); String publisherId metaArticle.getPublisherId(); if(publisherIdSet.contains(publisherId)) { //页码重复的论文追加序号 while (true) { /***************************页码重复论文***************************/ if(!publisherId.contains( _ )) publisherId publisherId _1 ; if(publisherIdSet.contains(publisherId)) { int count Integer.parseInt(publisherId.split( _ )[1]); publisherId publisherId.replace( _ count, _ (count 1)); }else { metaArticle.setPublisherId(publisherId); break; } } } publisherIdSet.add(metaArticle.getPublisherId()); articleMap.put(publisherId, metaArticle); System.out.println( SELENIUM articleMap.size()); List WebElement totalEles driver.findElements(By.cssSelector( .total-inform span )); String downCount totalEles.get(0).getText().replace( 下载 , ); //业务信息 ArticleBusiness articleBusiness new ArticleBusiness(); articleBusiness.setArticleId(metaArticle.getId()); articleBusiness.setPdfFileName(metaArticle.getId() .pdf ); articleBusiness.setArticleState( -1 ); articleBusiness.setArticleType( 1 ); articleBusiness.setPdfDownCount(Integer.valueOf(downCount)); articleBusiness.setViewCount(0); businessList.add(articleBusiness); } catch (Exception e) { try { if(!driver.toString().contains( null )) { //System.out.println(driver.toString()); driver.quit(); System.out.println(driver.toString()); } } catch (Exception e2) { } // 页码超时 采用Jsoup抓取 CloseableHttpClient client HttpClients.createDefault(); try { //http://journal02.magtech.org.cn/Jwk_xddl/CN/volumn/volumn_59.shtml HttpGet get new HttpGet(articleLinkByYearIssue.get(k)); get.setHeader( User-Agent , Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0 ); CloseableHttpResponse response 
client.execute(get); HttpEntity entity response.getEntity(); String pageContent EntityUtils.toString(entity, UTF-8 ); Document pageDoc Jsoup.parse(pageContent); Article article getArticleMetaJsoup(pageDoc, journal, catalog, articleLinkByYearIssue.get(k)); String publisherId article.getPublisherId(); if(publisherIdSet.contains(publisherId)) { //页码重复的论文追加序号 while (true) { /***************************页码重复论文***************************/ if(!publisherId.contains( _ )) publisherId publisherId _1 ; if(publisherIdSet.contains(publisherId)) { int count Integer.parseInt(publisherId.split( _ )[1]); publisherId publisherId.replace( _ count, _ (count 1)); }else { article.setPublisherId(publisherId); break; } } } publisherIdSet.add(article.getPublisherId()); articleMap.put(publisherId, article); System.out.println( JSOUP articleMap.size()); String downCount ; Elements totalEles pageDoc.select( .total-inform span ); for (Element totalEle : totalEles) { if(totalEle.text().contains( 下载 )) downCount totalEle.html().replace( 下载 , ); } //业务信息 ArticleBusiness articleBusiness new ArticleBusiness(); articleBusiness.setArticleId(article.getId()); articleBusiness.setPdfFileName(article.getId() .pdf ); articleBusiness.setArticleState( -1 ); articleBusiness.setArticleType( 1 ); if(! .equals(downCount)) articleBusiness.setPdfDownCount(Integer.valueOf(downCount)); articleBusiness.setViewCount(0); businessList.add(articleBusiness); }catch(Exception e2) { e2.printStackTrace(); }finally { try { if(client ! 
null) client.close(); } catch (IOException e2) { e2.printStackTrace(); } } } if(k articleLinkByYearIssue.size() - 1) { System.out.println( );// for (Entry String, Article set : articleMap.entrySet()) {// System.out.println(set.getKey() set.getValue().getFpage());// } if(driver.toString().contains( null )) { System.out.println(driver.toString()); driver new ChromeDriver(chromeOptions); driver.get(articleLinkByYearIssue.get(k)); } String lastArticleHandle driver.getWindowHandle(); WebElement artilceListEle null; List WebElement elements null; String issueURL ; try { List WebElement crumbEles driver.findElements(By.cssSelector( .top-tip a )); WebElement yearIssueEle crumbEles.get(1); yearIssueEle.click(); List String windows new ArrayList String (driver.getWindowHandles()); driver.switchTo().window(windows.get(windows.size()-1)); issueURL driver.getCurrentUrl(); try {Thread.sleep(2000);} catch (InterruptedException e1) {e1.printStackTrace();} artilceListEle new WebDriverWait(driver, 10).until(ExpectedConditions.presenceOfElementLocated(By.id( CataLogContent ))); //System.out.println(artilceListEle.getText()); //List WebElement elements artilceListEle.findElements(By.cssSelector( dt,dd elements new WebDriverWait(driver, 10).until(ExpectedConditions.visibilityOfNestedElementsLocatedBy(artilceListEle, By.cssSelector( dt,dd ))); String categoryName ; for (int i 0; i elements.size(); i ) { WebElement element elements.get(i); if( dt .equals(element.getTagName())) { String lanmu element.getAttribute( innerHTML ); if(!categoryName.equals(lanmu)) categoryName lanmu; }else { WebElement titleEle element.findElement(By.cssSelector( .name a )); String link titleEle.getAttribute( href ).trim(); String publisherID ; if(link.contains( filename )) { //filename LCGD201412004 publisherID link.substring(link.indexOf( filename ) 9); if(publisherID.contains( )) publisherID publisherID.substring(0, publisherID.indexOf( )); }else { WebElement pageEle element.findElement(By.cssSelector( 
.company )); String page pageEle.getAttribute( innerHTML ).trim(); publisherID catalog.getYear() - catalog.getIssue() - page; } // 根据页码获取Article 存储栏目 if(articleMap.containsKey(publisherID)) { Article newArticle articleMap.get(publisherID); newArticle.setCategoryName(categoryName); newArticle.setCategoryNameCn(categoryName); articleMap.put(publisherID, newArticle); } } } } catch (Exception e) { //JSOUP // 页码超时 采用Jsoup抓取 if(! .equals(issueURL)) { try { Document document Jsoup.parse(driver.getPageSource()); Element catalogContent document.getElementById( CataLogContent ); Elements dtddEles catalogContent.select( dt,dd ); if(!dtddEles.isEmpty()) { String categoryName ; for (Element element : dtddEles) { if( dt .equals(element.tagName())) { String lanmu element.html(); if(!categoryName.equals(lanmu)) categoryName lanmu; }else { Element titleEle element.select( .name a ).get(0); String link titleEle.attr( href ).trim(); String publisherID ; if(link.contains( filename )) { //filename LCGD201412004 publisherID link.substring(link.indexOf( filename ) 9); if(publisherID.contains( )) publisherID publisherID.substring(0, publisherID.indexOf( )); }else { Element pageEle element.getElementsByClass( company ).get(0); String page pageEle.text().trim(); publisherID catalog.getYear() - catalog.getIssue() - page; } // 根据页码获取Article 存储栏目 if(articleMap.containsKey(publisherID)) { Article newArticle articleMap.get(publisherID); newArticle.setCategoryName(categoryName); newArticle.setCategoryNameCn(categoryName); articleMap.put(publisherID, newArticle); } } //System.out.println( #################################################################################### //System.out.println(element.toString()); } } } catch (Exception e2) { System.out.println(catalog.getYear() _ catalog.getIssue() :栏目获取失败 ); } } } for (Entry String, Article set : articleMap.entrySet()) { articleList.add(set.getValue()); } } } try { //清理内存 Runtime runtime Runtime.getRuntime(); runtime.exec( taskkill /f /im 
chromedriver.exe /t ); runtime.exec( taskkill /f /im chrome.exe /t ); } catch (Exception e) { e.printStackTrace(); } dataMap.put( articles , articleList); dataMap.put( articleBusinesss , businessList); return dataMap; } public Article getArticleMetaJsoup(Document pageDoc,Journal journal,JournalCatalog catalog,String articleLink) { Article article new Article(); List ArticleAuthor authorList new ArrayList ArticleAuthor (); List ArticleAffiliation affiliationList new ArrayList ArticleAffiliation (); List ArticleKeyword keywordList new ArrayList ArticleKeyword (); List ArticleFundPrj fundPrjList new ArrayList ArticleFundPrj (); Element titleEle pageDoc.select( .wx-tit h1 ).get(0); String title titleEle.text(); //作者 Element authorsEle pageDoc.getElementById( authorpart ); Elements authorEles authorsEle.select( span a ); Pattern patternSup Pattern.compile( sup (.*) /sup ); for (int i 0; i authorEles.size(); i ) { String authorStr authorEles.get(i).text().trim(); Matcher matcher patternSup.matcher(authorStr); String tagId ; if(matcher.find()) tagId matcher.group(); authorStr authorStr.replace(tagId, ); ArticleAuthor author new ArticleAuthor(); author.setAuthorName(authorStr); author.setAuthorNameCn(authorStr); author.setSortNumber(i 1); if(! 
.equals(tagId)) { tagId tagId.replace( sup , ).replace( /sup , ); author.setAuthorTagVal(tagId); String tagValue ; String[] tagValues new String[]{}; if(tagId.contains( , )) { tagValues tagId.split( , ); } if(tagValues.length 1) { for (int j 0; j tagValues.length; j ) { tagValue aff tagValues[j] , ; } if(tagValue.endsWith( , )) tagValue tagValue.substring(0, tagValue.length()-1); }else { tagValue aff tagId; } author.setAddressTagIds(tagValue); } authorList.add(author); } //作者地址 Elements elements pageDoc.select( .wx-tit h3 ); Elements addressEles elements.get(elements.size()-1).getElementsByTag( a ); for (int i 0; i addressEles.size(); i ) { String addressStr addressEles.get(i).text(); ArticleAffiliation affiliation new ArticleAffiliation(); affiliation.setSortNumber(i 1); affiliation.setAddress(addressStr); affiliation.setAddressCn(addressStr); affiliationList.add(affiliation); } //摘要、关键词、基金、分类号 Elements rowEles pageDoc.select( .row ); for (Element rowEle : rowEles) { String content rowEle.text(); if(content.contains( 摘要 )) { String abstractInfo rowEle.getElementById( ChDivSummary ).outerHtml() .replace( span id \\ ChDivSummary\\ name \\ ChDivSummary\\ class \\ abstract-text\\ , ).replace( /span , ); article.setAbstractinfo(abstractInfo); article.setAbstractinfoCn(abstractInfo); }else if(content.contains( 关键词 )) { String keywordStr content.replace( 关键词 , ); String[] keywords keywordStr.split( ); for (int i 0; i keywords.length; i ) { ArticleKeyword keyword new ArticleKeyword(); keyword.setSortNum(i 1); keyword.setKeyword(keywords[i]); keyword.setKeywordCn(keywords[i]); keywordList.add(keyword); } }else if(content.contains( 基金 )) { String fundPrj content.replace( 基金资助 , ); ArticleFundPrj articleFundPrj new ArticleFundPrj(); articleFundPrj.setSortNum(1); articleFundPrj.setFundsInfo(fundPrj); articleFundPrj.setFundsInfoCn(fundPrj); fundPrjList.add(articleFundPrj); }else if (content.contains( 分类号 )) { Elements topSpaceEles rowEle.getElementsByClass( top-space ); for 
(int i 0; i topSpaceEles.size(); i ) { String text topSpaceEles.get(i).text(); if(text.contains( 分类号 )) { article.setClcNos(text.replace( 分类号 , ).trim()); }else if(text.contains( DOI )) { article.setDoi(text.replace( DOI , ).trim()); } } } } //页码 Element citationEle pageDoc.select( .top-tip ).get(0); String citationStr citationEle.text(); String pageStr ; Pattern pattern Pattern.compile( [0-9] -[0-9] ); Matcher matcher pattern.matcher(citationStr); if(matcher.find()) { //起始页-结束页 pageStr matcher.group(); String[] pages pageStr.split( - ); article.setFpage(pages[0]); article.setLpage(pages[1]); }else { //结束页 pattern Pattern.compile( 第[0-9] 页 ); matcher pattern.matcher(citationStr); if(matcher.find()) { pageStr matcher.group().replace( 第 , ).replace( 页 , ).trim(); article.setFpage(pageStr); } } String publisherId ; if(articleLink.contains( filename )) { publisherId articleLink.substring(articleLink.indexOf( filename )).replace( filename , ); if(publisherId.contains( )) publisherId publisherId.substring(0, publisherId.indexOf( )); } if( .equals(publisherId)) publisherId catalog.getYear() - catalog.getIssue() - pageStr; article.setId(publisherId); article.setArticleNo(publisherId); article.setPublisherId(publisherId); article.setTitle(title); article.setTitleCn(title); article.setLanguage(journal.getLanguage()); article.setYear(catalog.getYear()); article.setIssue(catalog.getIssue()); if(authorList.size() 0) article.setAuthors(authorList); if(affiliationList.size() 0) article.setAffiliations(affiliationList); if(keywordList.size() 0) article.setKeywords(keywordList); if(fundPrjList.size() 0) article.setFundPrjs(fundPrjList); article.setJournal(journal); article.setCatalogId(catalog.getId()); article.setReleaseState(1); return article; } public Article sipderArticleMeta(WebDriver driver,Journal journal,JournalCatalog catalog) { Article article new Article(); //获取文章编号 String currentUrl driver.getCurrentUrl(); String publisherId ; if(currentUrl.contains( filename )) { 
publisherId currentUrl.substring(currentUrl.indexOf( filename )).replace( filename , ); if(publisherId.contains( )) publisherId publisherId.substring(0, publisherId.indexOf( )); } List ArticleAuthor authorList new ArrayList ArticleAuthor (); List ArticleAffiliation affiliationList new ArrayList ArticleAffiliation (); List ArticleKeyword keywordList new ArrayList ArticleKeyword (); List ArticleFundPrj fundPrjList new ArrayList ArticleFundPrj (); WebElement wxtitEle driver.findElement(By.className( wx-tit )); WebElement titleEle wxtitEle.findElement(By.tagName( h1 )); String title titleEle.getAttribute( innerHTML ); //作者 WebElement authorsEle wxtitEle.findElement(By.id( authorpart )); List WebElement authorEles authorsEle.findElements(By.cssSelector( span )); Pattern patternSup Pattern.compile( sup (.*) /sup ); for (int i 0; i authorEles.size(); i ) { String authorStr ; if(authorEles.get(i).getAttribute( innerHTML ).contains( a )) { authorStr authorEles.get(i).findElement(By.tagName( a )).getAttribute( innerHTML ).trim(); }else { authorStr authorEles.get(i).getAttribute( innerHTML ).trim(); } Matcher matcher patternSup.matcher(authorStr); String tagId ; if(matcher.find()) tagId matcher.group(); authorStr authorStr.replace(tagId, ).replace( i class \\ icon-email\\ /i , ).trim(); ArticleAuthor author new ArticleAuthor(); author.setAuthorName(authorStr); author.setAuthorNameCn(authorStr); author.setSortNumber(i 1); if(! 
.equals(tagId)) { tagId tagId.replace( sup , ).replace( /sup , ); author.setAuthorTagVal(tagId); String tagValue ; String[] tagValues new String[]{}; if(tagId.contains( , )) { tagValues tagId.split( , ); } if(tagValues.length 1) { for (int j 0; j tagValues.length; j ) { tagValue aff tagValues[j] , ; } if(tagValue.endsWith( , )) tagValue tagValue.substring(0, tagValue.length()-1); }else { tagValue aff tagId; } author.setAddressTagIds(tagValue); } authorList.add(author); } //作者地址 List WebElement findElements wxtitEle.findElements(By.tagName( h3 )); List WebElement addressEles findElements.get(findElements.size()-1).findElements(By.tagName( a )); for (int i 0; i addressEles.size(); i ) { String addressStr addressEles.get(i).getAttribute( innerHTML ); ArticleAffiliation affiliation new ArticleAffiliation(); affiliation.setSortNumber(i 1); affiliation.setAddress(addressStr); affiliation.setAddressCn(addressStr); affiliationList.add(affiliation); //findElements.remove(addressEles.get(i)); } //摘要、关键词、基金、分类号 List WebElement rowEles driver.findElements(By.className( row )); for (WebElement rowEle : rowEles) { String content rowEle.getText(); if(content.contains( 摘要 )) { try { WebElement moreEle rowEle.findElement(By.id( ChDivSummaryMore )); if(moreEle.isDisplayed()) moreEle.click(); } catch (Exception e) { } String abstractInfo rowEle.findElement(By.id( ChDivSummary )).getAttribute( innerHTML ) .replace( span id \\ ChDivSummary\\ name \\ ChDivSummary\\ class \\ abstract-text\\ , ).replace( /span , ); article.setAbstractinfo(abstractInfo); article.setAbstractinfoCn(abstractInfo); }else if(content.contains( 关键词 )) { String keywordStr content.replace( 关键词 , ); String[] keywords keywordStr.split( ); for (int i 0; i keywords.length; i ) { ArticleKeyword keyword new ArticleKeyword(); keyword.setSortNum(i 1); keyword.setKeyword(keywords[i]); keyword.setKeywordCn(keywords[i]); keywordList.add(keyword); } }else if(content.contains( 基金 )) { String fundPrj content.replace( 基金资助 , ); 
ArticleFundPrj articleFundPrj new ArticleFundPrj(); articleFundPrj.setSortNum(1); articleFundPrj.setFundsInfo(fundPrj); articleFundPrj.setFundsInfoCn(fundPrj); fundPrjList.add(articleFundPrj); }else if (content.contains( 分类号 )) { List WebElement topSpaceEles rowEle.findElements(By.cssSelector( .top-space )); for (int i 0; i topSpaceEles.size(); i ) { String text topSpaceEles.get(i).getAttribute( innerText ); if(text.contains( 分类号 )) { article.setClcNos(text.replace( 分类号 , ).trim()); }else if(text.contains( DOI )) { article.setDoi(text.replace( DOI , ).trim()); } } } } //页码 WebElement citationEle driver.findElement(By.className( top-tip )); String citationStr citationEle.getText(); String pageStr ; Pattern pattern Pattern.compile( [0-9] -[0-9] ); Matcher matcher pattern.matcher(citationStr); if(matcher.find()) { //起始页-结束页 pageStr matcher.group(); String[] pages pageStr.split( - ); article.setFpage(pages[0]); article.setLpage(pages[1]); }else { //结束页 pattern Pattern.compile( 第[0-9] 页 ); matcher pattern.matcher(citationStr); if(matcher.find()) { pageStr matcher.group().replace( 第 , ).replace( 页 , ).trim(); article.setFpage(pageStr); } } if( .equals(publisherId)) publisherId catalog.getYear() - catalog.getIssue() - pageStr; article.setId(publisherId); article.setArticleNo(publisherId); article.setPublisherId(publisherId); article.setTitle(title); article.setTitleCn(title); article.setLanguage(journal.getLanguage()); article.setYear(catalog.getYear()); article.setIssue(catalog.getIssue()); if(authorList.size() 0) article.setAuthors(authorList); if(affiliationList.size() 0) article.setAffiliations(affiliationList); if(keywordList.size() 0) article.setKeywords(keywordList); if(fundPrjList.size() 0) article.setFundPrjs(fundPrjList); article.setJournal(journal); article.setCatalogId(catalog.getId()); article.setReleaseState(1); return article; } /** * 抓取论文链接地址 * param driver * param year * param issue * return */ public static List String 
parseArticleHTMLByYearIssue(WebDriver driver,JournalCatalog catalog){ List String articleLinkList new ArrayList String (); List String articleHTMLLinkList new ArrayList String (); String year catalog.getYear(); String issue catalog.getIssue(); WebDriverWait wait new WebDriverWait(driver, 10); List WebElement pageEles null; try { driver.manage().timeouts().pageLoadTimeout(3, TimeUnit.SECONDS); pageEles wait.until(ExpectedConditions.presenceOfAllElementsLocatedBy(By.cssSelector( .page-list a ))); } catch (Exception e) { //driver.manage().timeouts().implicitlyWait(3, TimeUnit.SECONDS); pageEles driver.findElements(By.cssSelector( .page-list a )); } if(pageEles ! null !pageEles.isEmpty()) { Pattern pattern Pattern.compile( [0-9] ); for (int i 0; i pageEles.size(); i ) { //判断页码元素值是否为数字 String pageStr pageEles.get(i).getText().trim(); if( .equals(pageStr)) pageStr pageEles.get(i).getAttribute( innerHTML ).trim(); Matcher matcher pattern.matcher(pageStr); if(matcher.find()) { boolean flag false; WebElement yearIssueEle driver.findElement(By.id( yearissue i)); List WebElement yearEles yearIssueEle.findElements(By.tagName( dl )); for (int j 0; j yearEles.size(); j ) { WebElement yearEle yearEles.get(j).findElement(By.tagName( dt )); String yearText yearEle.getText().trim(); if( .equals(yearText)) yearText yearEle.getAttribute( innerText ); if(year.equals(yearText)) { //指定页 flag true; pageEles.get(i).click(); break; } } if(flag) {//切换到当前页 for (int j 0; j yearEles.size(); j ) { WebElement yearEle yearEles.get(j).findElement(By.tagName( dt )); String yearText yearEle.getText().trim(); if( .equals(yearText)) yearText yearEle.getAttribute( innerText ); if(year.equals(yearText)) { //指定年 yearEle.click(); try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } WebElement issEle yearEles.get(j).findElement(By.tagName( dd ));// String isShow issEle.getCssValue( display // if( none .equals(isShow)) continue; List WebElement issueEles 
issEle.findElements(By.tagName( a )); for (int k 0; k issueEles.size(); k ) { WebElement issueEle issueEles.get(k); if(Integer.parseInt(issue) 10 !issue.contains( 0 )) issue 0 issue; if(issue.equals(issueEle.getAttribute( innerHTML ).replace( No. , ).trim())) { //指定期 issueEle.click(); try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } WebElement catalogContentEle driver.findElement(By.id( CataLogContent )); List WebElement articleByCatalogEles catalogContentEle.findElements(By.cssSelector( dd .name a )); for (WebElement articleEle : articleByCatalogEles) { String href articleEle.getAttribute( href ); String parameter href.substring(href.indexOf( ? )); //fileName String artileTitle articleEle.getText(); String articleUrl https://kns.cnki.net/kcms/detail/detail.aspx parameter; String articleHTMLUrl https://kns.cnki.net/KXReader/Detail parameter; //System.out.println(artileTitle); //System.out.println(articleUrl); //System.out.println(articleHTMLUrl); articleLinkList.add(articleUrl); articleHTMLLinkList.add(articleHTMLUrl); } } } } } //System.out.println( 数据抓取完成 System.out.println( year _ issue articleLinkList.size() 篇 ); driver.quit(); break; } }else { WebElement yearIssueEle driver.findElement(By.id( yearissue 0 )); List WebElement yearEles yearIssueEle.findElements(By.tagName( dl )); for (int j 0; j yearEles.size(); j ) { WebElement yearEle yearEles.get(j).findElement(By.tagName( dt )); String yearText yearEle.getText().trim(); if( .equals(yearText)) yearText yearEle.getAttribute( innerText ); if(year.equals(yearText)) { //只有一页,指定年 yearEle.click(); try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } WebElement issEle yearEles.get(j).findElement(By.tagName( dd ));// String isShow issEle.getCssValue( display // if( none .equals(isShow)) continue; List WebElement issueEles issEle.findElements(By.tagName( a )); for (int k 0; k issueEles.size(); k ) { WebElement issueEle issueEles.get(k); 
if(Integer.parseInt(issue) 10 !issue.contains( 0 )) issue 0 issue; if(issue.equals(issueEle.getAttribute( innerHTML ).replace( No. , ).trim())) { //指定期 issueEle.click(); try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } WebElement catalogContentEle driver.findElement(By.id( CataLogContent )); List WebElement articleByCatalogEles catalogContentEle.findElements(By.cssSelector( dd .name a )); for (WebElement articleEle : articleByCatalogEles) { String href articleEle.getAttribute( href ); String parameter href.substring(href.indexOf( ? )); //fileName String artileTitle articleEle.getText(); String articleUrl https://kns.cnki.net/kcms/detail/detail.aspx parameter; String articleHTMLUrl https://kns.cnki.net/KXReader/Detail parameter; System.out.println(artileTitle); System.out.println(articleUrl); System.out.println(articleHTMLUrl); articleLinkList.add(articleUrl); articleHTMLLinkList.add(articleHTMLUrl); } } } } } driver.quit(); } } } return articleLinkList; } /** * 【3】知网全文HTML页抓取 * param entityManager * param journalPath * param journalURL * param catalog * return */ SuppressWarnings( unchecked ) public void spiderArticleHTML(EntityManager entityManager,String journalPath,String journalURL,JournalCatalog catalog) { if(catalog.getRemark().contains( HTML )) return; System.out.println( catalog.getYear() _ catalog.getIssue() :开始处理 ); List Article newArticleList new ArrayList Article (); List Article articleList entityManager.createQuery( FROM Article.class.getName() WHERE catalogId catalog.getId() ).getResultList(); try { // 谷歌驱动配置 WebDriver driver null; ChromeDriverService service null; String chromeDriverPath D:\\\\webDriver\\\\chromedriver.exe ; HashMap String, Object chromePrefs new HashMap String, Object (); ChromeOptions chromeOptions new ChromeOptions(); try { chromePrefs.put( download.default_directory , D:\\\\webDriver ); File chromeDriverFile new File(chromeDriverPath); System.setProperty( webdriver.chrome.driver , 
chromeDriverPath); chromeOptions.setExperimentalOption( prefs , chromePrefs); // 设置为 headless 模式 必须 // chromeOptions.addArguments( --headless chromeOptions.addArguments( --disable-gpu ); chromeOptions.addArguments( --no-sandbox ); // 禁止弹出拦截 chromeOptions.addArguments( --disable-popup-blocking ); // 禁止默认浏览器检查 chromeOptions.addArguments( no-default-browser-check ); chromeOptions.addArguments( about:histograms ); chromeOptions.addArguments( about:cache ); chromeOptions.addArguments( --start-maximized ); // 创建一个 ChromeDriver 接口 service new ChromeDriverService.Builder().usingDriverExecutable(chromeDriverFile).usingAnyFreePort() .build(); service.start(); } catch (IOException e1) { e1.printStackTrace(); } driver new ChromeDriver(chromeOptions); int htmlCount 0; for (int i 0; i articleList.size(); i ) { CloseableHttpClient client HttpClients.createDefault(); String firstPage ; try { Article article articleList.get(i); if( HTML .equals(article.getRemark2())) { htmlCount htmlCount 1; continue; //表示抓取过知网HTML页面 } String link https://kns.cnki.net/kcms/detail/detail.aspx?sfield FN dbCode CJFD filename article.getId() tableName CJFD2000 url ; HttpGet get new HttpGet(link); get.setHeader( User-Agent , Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0 ); CloseableHttpResponse response client.execute(get); HttpEntity entity response.getEntity(); String pageContent EntityUtils.toString(entity, UTF-8 ); Document pageDoc Jsoup.parse(pageContent); Element btnEles pageDoc.getElementById( DownLoadParts ); String btnName btnEles.text(); if (!btnName.contains( HTML阅读 )) continue; //没有全文按钮不抓取 driver.manage().timeouts().implicitlyWait(15, TimeUnit.SECONDS); driver.manage().timeouts().pageLoadTimeout(30, TimeUnit.SECONDS); // 跳转全文页 driver.get(link); try { Thread.sleep(3000); } catch (Exception e) { // TODO: handle exception } firstPage driver.getWindowHandle(); WebElement element driver.findElement(By.id( DownLoadParts )); WebElement btnHTMLEle 
element.findElement(By.cssSelector( .btn-html a )); btnHTMLEle.click(); try { Thread.sleep(2000); } catch (InterruptedException e) { e.printStackTrace(); } List String windows new ArrayList String (driver.getWindowHandles()); driver.switchTo().window(windows.get(windows.size() - 1)); if( 安全验证 .equals(driver.getTitle())) { while (true) { if(! 安全验证 .equals(driver.getTitle())) break; } } Document document Jsoup.parse(driver.getPageSource()); System.out.println(article.getTitle() 开始抓取HTML ); Element contentEle document.getElementsByClass( content ).get(0); Elements titleEles contentEle.getElementsByTag( h1 ); if (titleEles.size() 1) { // 包含英文标题再继续 List ArticleAuthor authorList new ArrayList ArticleAuthor (); List ArticleAffiliation affiliationList new ArrayList ArticleAffiliation (); List ArticleKeyword keywordList new ArrayList ArticleKeyword (); List ArticleReference referenceList new ArrayList ArticleReference (); Element titleEnEle titleEles.get(titleEles.size() - 1); String titleEn titleEnEle.text(); List String bioList new ArrayList String (); Elements briefEles contentEle.select( .brief ); if(briefEles.size() 0) { Elements pEles briefEles.get(0).getElementsByTag( p ); for (int j 0; j pEles.size(); j ) { String content pEles.get(j).text(); if(content.contains( 作者简介 )) { Elements authroBioEles pEles.get(j).getElementsByTag( span ); for (int k 0; k authroBioEles.size(); k ) { bioList.add(authroBioEles.get(k).text()); } }else if(content.contains( 收稿日期 )) { String received content.replace( 收稿日期 , ); article.setReceivedDate(new SimpleDateFormat( yyyy-MM-dd ).parse(received)); } } } Element authorEnEle titleEnEle.nextElementSibling(); if ( h2 .equals(authorEnEle.tagName())) { Elements authorEnEles authorEnEle.getElementsByTag( span ); List ArticleAuthor authors article.getAuthors(); if(authors.size() ! 
authorEnEles.size()) { //补救数据-------------------------------------------------------------------------------------------- Element authorCnEle contentEle.selectFirst( .top-title h2 ); Elements authorCnEles authorCnEle.select( a ); System.out.println( 中英文作者不一致: article.getId() [ authors.size() -CN: authorCnEles.size() -EN: authorEnEles.size() ] ); for (int j 0; j authorCnEles.size(); j ) { //遍历中文 Element authorCnStrEle authorCnEles.get(j); String authorCn authorCnStrEle.text(); boolean flag false; //判断作者集合是否存在当前作者 for (int k 0; k authors.size(); k ) { //遍历库中作者 ArticleAuthor author authors.get(k); if(authorCn.equals(author.getAuthorNameCn())) { flag true; try { author.setAuthorNameEn(authorEnEles.get(j).text()); } catch (Exception e) { // TODO: handle exception } author.setSortNumber(j 1); if(bioList.size() 0) { for (int j2 0; j2 bioList.size(); j2 ) { if(bioList.get(j2).contains(author.getAuthorNameCn())) { author.setBio(bioList.get(j2)); author.setBioCn(bioList.get(j2)); } } } authorList.add(author); } } if(!flag) { ArticleAuthor author new ArticleAuthor(); author.setArticleId(article.getId()); author.setAuthorName(authorCn); author.setAuthorNameCn(authorCn); try { author.setAuthorNameEn(authorEnEles.get(j).text()); } catch (Exception e) { // TODO: handle exception } author.setSortNumber(j 1); if(bioList.size() 0) { for (int j2 0; j2 bioList.size(); j2 ) { if(bioList.get(j2).contains(author.getAuthorNameCn())) { author.setBio(bioList.get(j2)); author.setBioCn(bioList.get(j2)); } } } authorList.add(author); } } //----------------------------------------------------------------------------------------------------- }else { for (int j 0; j authors.size(); j ) { ArticleAuthor author authors.get(j); author.setAuthorNameEn(authorEnEles.get(j).text()); if(bioList.size() 0) { for (int j2 0; j2 bioList.size(); j2 ) { if(bioList.get(j2).contains(author.getAuthorNameCn())) { author.setBio(bioList.get(j2)); author.setBioCn(bioList.get(j2)); } } } authorList.add(author); } } } 
Element addressEnEle authorEnEle.nextElementSibling(); if ( h2 .equals(addressEnEle.tagName())) { Elements addressEnEles addressEnEle.getElementsByTag( span ); List ArticleAffiliation affiliations article.getAffiliations(); if(affiliations.size() ! addressEnEles.size()) { System.out.println( 中英文地址不一致: article.getId()); Element addressCnEle contentEle.selectFirst( .top-title h2 h2 ); Elements addressCnEles addressCnEle.select( a,span ); for (int j 0; j addressCnEles.size(); j ) { String addressCn addressCnEles.get(j).text(); ArticleAffiliation affiliation null; if(affiliations.size() j) { affiliation affiliations.get(j); }else { affiliation new ArticleAffiliation(); affiliation.setArticleId(article.getId()); } affiliation.setAddress(addressCn); affiliation.setAddressCn(addressCn); try { affiliation.setAddressEn(addressEnEles.get(j).text()); } catch (Exception e) { // TODO: handle exception } affiliation.setSortNumber(j 1); affiliationList.add(affiliation); } }else { for (int j 0; j affiliations.size(); j ) { ArticleAffiliation affiliation affiliations.get(j); affiliation.setAddressEn(addressEnEles.get(j).text()); affiliationList.add(affiliation); } } } Element abstractEle contentEle.select( #a_abstractEN p ).get(0); String abstractEn abstractEle.outerHtml().replace( p , ).replace( /p , ); Elements keywordEles contentEle.select( #a_keywordsEN p a ); if(keywordEles.size() 0) { List ArticleKeyword keywords article.getKeywords(); if(keywords.size() ! 
keywordEles.size()) { Element keywordCnEle contentEle.getElementById( a_keywords ); Elements keywordCnEles keywordCnEle.select( p a ); System.out.println( 中英文关键词不一致: article.getId() [ keywords.size() -CN: keywordCnEles.size() -EN: keywordEles.size() ] ); if(keywordCnEles.size() keywordEles.size()) { for (int j 0; j keywordCnEles.size(); j ) { String keywordCn keywordCnEles.get(j).text(); ArticleKeyword keyword null; if(keywords.size() j) { keyword keywords.get(j); }else { keyword new ArticleKeyword(); keyword.setArticleId(article.getId()); } keyword.setKeyword(keywordCn); keyword.setKeywordCn(keywordCn); try { keyword.setKeywordEn(keywordEles.get(j).text()); } catch (Exception e) { if(keywordEles.size() keywordCnEles.size()) { System.out.println( 英文关键词比中文少: (keywordCnEles.size()-keywordEles.size())); } } keyword.setSortNum(j 1); keywordList.add(keyword); } }else { //英文多于中文关键词 for (int j 0; j keywordEles.size(); j ) { String keywordEn keywordEles.get(j).text(); ArticleKeyword keyword null; if(keywords.size() j) { keyword keywords.get(j); }else { keyword new ArticleKeyword(); keyword.setArticleId(article.getId()); } keyword.setKeywordEn(keywordEn); try { keyword.setKeyword(keywordCnEles.get(j).text()); keyword.setKeywordCn(keywordCnEles.get(j).text()); } catch (Exception e) { keyword.setKeyword( ); keyword.setKeywordCn( ); if(keywordEles.size() keywordCnEles.size()) { System.out.println( 英文关键词比中文多: (keywordEles.size()-keywordCnEles.size())); } } keyword.setSortNum(j 1); keywordList.add(keyword); } } }else { for (int j 0; j keywords.size(); j ) { String keywordStr keywordEles.get(j).text(); if(!keywordStr.contains( )) keywordStr keywordStr.replace( , ); ArticleKeyword keyword keywords.get(j); keyword.setKeywordEn(keywordStr); keywordList.add(keyword); } } } Elements referEles contentEle.select( #a_bibliography p ); for (int j 0; j referEles.size(); j ) { Element referEle referEles.get(j).selectFirst( a ); String sortStr ; if(referEle.selectFirst( b ) ! 
null) { sortStr referEle.selectFirst( b ).text(); }else { sortStr (j 1) ; } String referStr referEle.html().replace(sortStr, ).replace( b , ).replace( /b , ); ArticleReference reference new ArticleReference(); reference.setArticleId(article.getId()); reference.setAllinfo(referStr); if( [] .equals(sortStr)) sortStr (j 1) ; reference.setSortnum(Integer.parseInt(sortStr.replace( [ , ).replace( ] , ))); Pattern pattern Pattern.compile( [\\u4e00-\\u9fa5] ); Matcher matcher pattern.matcher(referStr); if(matcher.find()) { //中文 reference.setAllinfoPartCn(referStr); }else { reference.setAllinfoPartEn(referStr); } referenceList.add(reference); } article.setTitleEn(titleEn); article.setAbstractinfoEn(abstractEn); article.setAuthors(authorList); article.setAffiliations(affiliationList); article.setKeywords(keywordList); article.setRefers(referenceList); article.setRemark2( HTML ); newArticleList.add(article); } } catch (Exception e) { e.printStackTrace(); } //driver.quit(); if(driver.toString().contains( null )) { driver new ChromeDriver(chromeOptions); }else { //System.out.println(driver.toString()); List String pageList new ArrayList String (driver.getWindowHandles()); for (int j 0; j pageList.size(); j ) { //if(firstPage.equals(pageList.get(j))) continue; if(j 0) continue; driver.switchTo().window(pageList.get(j)); driver.close(); } //System.out.println(driver.toString()); pageList new ArrayList String (driver.getWindowHandles()); driver.switchTo().window(pageList.get(pageList.size() - 1));// pageList new ArrayList String (driver.getWindowHandles());// if(pageList.size() 1) {// driver.switchTo().window(pageList.get(0));// }else {// driver.manage().deleteAllCookies();// driver.quit();// driver new ChromeDriver(chromeOptions);// } } } driver.manage().deleteAllCookies(); driver.quit(); //抓取完成后 验证目次是否已被更新过论文数据 “1”表示已存储并跳过 JournalCatalog journalCatalog (JournalCatalog) entityManager.createQuery( FROM JournalCatalog.class.getName() where id :id ).setParameter( id , 
catalog.getId()).getSingleResult(); if(journalCatalog.getRemark().contains( HTML )) return; //表示已抓取 catalog.setRemark(catalog.getRemark() _HTML ); try { if(newArticleList.size() 0) { entityManager.getTransaction().begin(); for (int i 0; i newArticleList.size(); i ) { Article article newArticleList.get(i); entityManager.persist(article); } entityManager.getTransaction().commit(); } if(articleList.size() htmlCount) { entityManager.getTransaction().begin(); entityManager.persist(catalog); entityManager.getTransaction().commit(); } } catch (Exception e) { entityManager.getTransaction().rollback(); e.printStackTrace(); System.out.println(catalog.getYear() _ catalog.getIssue() :该期已抓取提交 跳过 ); } } catch (Exception e) { e.printStackTrace(); System.out.println(catalog.getYear() _ catalog.getIssue() :数据抓取失败 ); } } /** * 获取期刊目次集合 * param driver * return */ public static List JournalCatalog getCnkiJournalCatalogList(WebDriver driver){ WebElement coverImgEle driver.findElement(By.cssSelector( #J_journalPic img )); String coverImgURL coverImgEle.getAttribute( src ).startsWith( http )? coverImgEle.getAttribute( src ): http: coverImgEle.getAttribute( src ); boolean flag false; //判断图片url是否带期号 String imgName coverImgURL.substring(coverImgURL.lastIndexOf( / ) 1);// System.out.println(coverImgURL); List JournalCatalog issueList new ArrayList JournalCatalog (); JournalCatalog catalog null; List WebElement pageEles driver.findElements(By.cssSelector( .page-list a )); if(pageEles ! 
null) { Pattern pattern Pattern.compile( [0-9] ); //判断图片名称是否包含数字 Matcher matcherImg pattern.matcher(imgName); if(matcherImg.find()) flag true; for (int i 0; i pageEles.size(); i ) { //判断页码元素值是否为数字 String pageStr pageEles.get(i).getText().trim(); if( .equals(pageStr)) pageStr pageEles.get(i).getAttribute( innerHTML ).trim(); Matcher matcher pattern.matcher(pageStr); if(matcher.find()) { WebElement yearIssueEle driver.findElement(By.id( yearissue i)); List WebElement yearEles yearIssueEle.findElements(By.tagName( dl ));//年 for(int j 0;j yearEles.size();j ) { WebElement yearEle yearEles.get(j).findElement(By.tagName( dt )); String year yearEle.getText().trim(); if( .equals(year)) year yearEle.getAttribute( innerHTML ).trim().replace( em , ).replace( /em , ); List WebElement issueEles yearEles.get(j).findElements(By.tagName( a ));//期 for (WebElement issueEle : issueEles) { String issueId issueEle.getAttribute( id ); String issueNo issueEle.getText().trim().replace( No. , ); if( .equals(issueNo)) issueNo issueEle.getAttribute( innerHTML ).trim().replace( No. , ); String currentIssueImgName ; if(flag) { currentIssueImgName imgName.substring(0, 4) issueId.replace( yq , ) .jpg ; }else { currentIssueImgName imgName.replace( .jpg , issueId.replace( yq , ) .jpg ); } catalog new JournalCatalog(); catalog.setId(issueId); catalog.setIssue(issueNo); catalog.setYear(year); catalog.setCoverImgSrc( journal/img/cover/ currentIssueImgName); issueList.add(catalog); } } } } return issueList; } return null; } public static List String parseIssueHTML(WebDriver driver){ List WebElement pageEles driver.findElements(By.cssSelector( .page-list a )); if(pageEles ! 
null) { Pattern pattern Pattern.compile( [0-9] ); for (int i 0; i pageEles.size(); i ) { //判断页码元素值是否为数字 String pageStr pageEles.get(i).getText().trim(); if( .equals(pageStr)) pageStr pageEles.get(i).getAttribute( innerHTML ).trim(); Matcher matcher pattern.matcher(pageStr); if(matcher.find()) { WebElement yearIssueEle driver.findElement(By.id( yearissue i)); List WebElement yearEles yearIssueEle.findElements(By.tagName( dl )); for (int j 0; j yearEles.size(); j ) { WebElement yearEle yearEles.get(j).findElement(By.tagName( dt )); yearEle.click(); try { Thread.sleep(1000); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } WebElement issEle yearEles.get(j).findElement(By.tagName( dd )); String isShow issEle.getCssValue( display ); if( none .equals(isShow)) continue; List WebElement issueEles issEle.findElements(By.tagName( a )); for (int k 0; k issueEles.size(); k ) { WebElement issueEle issueEles.get(k); issueEle.click(); try { Thread.sleep(1000); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } WebElement catalogContentEle driver.findElement(By.id( CataLogContent )); List WebElement articleByCatalogEles catalogContentEle.findElements(By.cssSelector( dd .name a )); for (WebElement articleEle : articleByCatalogEles) { String href articleEle.getAttribute( href ); System.out.println(articleEle.getAttribute( href )); String query href.substring(href.indexOf( ? 
)); String articleUrl https://kns.cnki.net/kcms/detail/detail.aspx query; try { CloseableHttpClient httpClient HttpClients.createDefault(); HttpPost get new HttpPost(articleUrl); get.addHeader( User-Agent ,userAgent); CloseableHttpResponse response httpClient.execute(get); HttpEntity entity response.getEntity(); InputStream in entity.getContent(); Document parse Jsoup.parse(in, utf-8 , ); Document document Jsoup.connect(articleUrl).get(); String content document.outerHtml(); } catch (Exception e) { // TODO: handle exception } articleEle.click(); List String tabs new ArrayList String (driver.getWindowHandles()); //存储多窗口句柄 //切换新标签页 WebDriver driver2 driver.switchTo().window(tabs.get(2)); WebElement downloadEle driver2.findElement(By.id( DownLoadParts )); WebElement htmlEle downloadEle.findElement(By.className( icon-dlcrsp )); String htmlRead htmlEle.getAttribute( innerHTML ).trim(); } } } } } } return null; } /** * 获取期刊知网地址 */ public static void getJournalURL() { String indexURL http://navi.cnki.net/knavi/Journal.html ; WebDriver driver null; ChromeDriverService service null; String chromeDriverPath D:\\\\webDriver\\\\chromedriver.exe ; try { HashMap String, Object chromePrefs new HashMap String, Object (); chromePrefs.put( download.default_directory , D:\\\\webDriver ); File chromeDriverFile new File(chromeDriverPath); System.setProperty( webdriver.chrome.driver ,chromeDriverPath); ChromeOptions chromeOptions new ChromeOptions(); chromeOptions.setExperimentalOption( prefs , chromePrefs); //设置为 headless 模式 必须 chromeOptions.addArguments( --headless ); chromeOptions.addArguments( --disable-gpu ); chromeOptions.addArguments( --no-sandbox ); // 禁止弹出拦截 chromeOptions.addArguments( --disable-popup-blocking ); // 禁止默认浏览器检查 chromeOptions.addArguments( no-default-browser-check ); chromeOptions.addArguments( about:histograms ); chromeOptions.addArguments( about:cache ); chromeOptions.addArguments( --start-maximized ); //创建一个 ChromeDriver 接口 service new 
ChromeDriverService.Builder().usingDriverExecutable(chromeDriverFile).usingAnyFreePort().build(); service.start(); driver new ChromeDriver(chromeOptions); } catch (IOException e1) { e1.printStackTrace(); } driver.get(indexURL); try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } WebDriverWait wait new WebDriverWait(driver, 30); Boolean isShow wait.until(ExpectedConditions.titleContains( 出版来源导航 )); WebElement journalNameEle driver.findElement(By.name( txt_1_value1 )); String journalName ; if(journalNameEle ! null) { Scanner scanner new Scanner(System.in); System.out.println( 请输入期刊名称 ); journalName scanner.next(); journalNameEle.sendKeys(journalName); } WebElement search driver.findElement(By.id( btnSearch )); search.click(); try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } WebElement totalcountEle driver.findElement(By.className( totalcount )); WebElement journalCountBySelEle totalcountEle.findElement(By.className( lblCount )); int journalCount Integer.parseInt(journalCountBySelEle.getText()); if(journalCount 0) { System.out.println( 未查询或知网未收录本刊: journalName); return; }else if(journalCount 1){ System.out.println( 查询到了 恭喜 ); WebElement resultEle driver.findElement(By.cssSelector( .result .list_tup )); List WebElement journalEles null; if(resultEle ! null) journalEles resultEle.findElements(By.cssSelector( li a )); if(journalEles ! null journalEles.size() 0) { WebElement journalEle journalEles.get(0); journalEle.click(); } }else { // 大于1条 WebElement resultEle driver.findElement(By.cssSelector( .result .list_tup )); List WebElement journalEles null; if(resultEle ! null) journalEles resultEle.findElements(By.cssSelector( li a )); if(journalEles ! 
null journalEles.size() 0) { for(int i 0;i journalEles.size();i ) { String journalTitle journalEles.get(i).getAttribute( title ); System.out.println(i 1 : journalTitle); } System.out.println( 选择期刊 选择期刊进行抓取 ); Scanner scanner new Scanner(System.in); String journalIndex scanner.next(); WebElement journalEle journalEles.get(Integer.parseInt(journalIndex)-1); journalEle.click(); } } // System.out.println(driver.getCurrentUrl());// System.out.println(driver.getTitle()); List String tabs new ArrayList String (driver.getWindowHandles()); //存储多窗口句柄// for (String string : tabs) {// System.out.println(string);// } //切换新标签页 WebDriver driver2 driver.switchTo().window(tabs.get(1)); String journalURL driver2.getCurrentUrl(); String journalPublisherId journalURL.substring(journalURL.indexOf( pykm ) 5); if(journalPublisherId.indexOf( ) ! -1) journalPublisherId journalPublisherId.substring(0, journalPublisherId.indexOf( )); System.out.println(driver2.getCurrentUrl()); System.out.println(driver2.getTitle()); driver2.quit(); } public void downloadPDF(String journalUrl,String journalName,String year,String issue) { Set String publisherIdSet new HashSet String (); //查询目次catalog、journal对象 JournalCatalog catalog new JournalCatalog(); catalog.setYear(year); catalog.setIssue(issue); File browserSavePath new File( C:\\\\Users\\\\wsh\\\\Downloads ); String language cn ; System.setProperty( webdriver.edge.driver , D:\\\\webDriver\\\\msedgedriver.exe ); Map String,Object dataMap new HashMap String,Object (); List Article articleList new ArrayList Article (); List ArticleBusiness businessList new ArrayList ArticleBusiness (); WebDriver driver new EdgeDriver(); //driver.manage().window().maximize(); driver.get(journalUrl); try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } List String articleLinkByYearIssue (List String ) parseArticleHTMLByYearIssue(driver,catalog); driver new EdgeDriver(); Map String, Article articleMap new HashMap String, Article (); for (int k 
0; k articleLinkByYearIssue.size(); k ) { try { driver.get(articleLinkByYearIssue.get(k)); } catch (Exception e) { try { driver.manage().wait(3000); driver.notify(); driver.get(articleLinkByYearIssue.get(k)); } catch (Exception e2) { System.out.println( 页面打开失败 ); } } WebDriverWait wait new WebDriverWait(driver, 10); WebElement btnsEle wait.until(ExpectedConditions.presenceOfElementLocated(By.id( DownLoadParts ))); //WebElement btnsEle driver.findElement(By.id( DownLoadParts List WebElement authorEles driver.findElements(By.cssSelector( #authorpart span )); if(!authorEles.isEmpty()) { WebElement authorEle authorEles.get(0); String authorStr authorEle.getText(); FileFilter filter new FileFilter() { Override public boolean accept(File file) { if(file.isFile()) { if(file.getName().contains(authorStr)) { return true; }else { return false; } } return false; } }; File[] files browserSavePath.listFiles(filter); if(files.length 0) { for (File file : files) { System.out.println(file.getAbsolutePath() 文件重复啦啦啦啦啦 ); } continue; } } WebElement btnDownloadEle null; try { btnDownloadEle btnsEle.findElement(By.cssSelector( .btn-dlpdf a )); } catch (Exception e) { System.out.println( 未找到下载按钮 ); continue; } btnDownloadEle.click(); try { Thread.sleep(5000); } catch (Exception e1) { } List String windowHandles new ArrayList String (driver.getWindowHandles()); if(windowHandles.size() 1) { try { driver.switchTo().window(windowHandles.get(windowHandles.size()-1)); if(driver.getCurrentUrl().contains( https://kdoc.cnki.net/kdoc/download.aspx )) { while (true) { windowHandles new ArrayList String (driver.getWindowHandles()); if(windowHandles.size() 1) break; } windowHandles new ArrayList String (driver.getWindowHandles()); driver.switchTo().window(windowHandles.get(windowHandles.size()-1)); } } catch (Exception e) { windowHandles new ArrayList String (driver.getWindowHandles()); driver.switchTo().window(windowHandles.get(windowHandles.size()-1)); continue; } }else { windowHandles new 
ArrayList String (driver.getWindowHandles()); driver.switchTo().window(windowHandles.get(windowHandles.size()-1)); } //driver new EdgeDriver(); } //C:\\Users\\wsh\\Downloads //修改下载文件目录 driver new EdgeDriver(); driver.get(journalUrl); WebDriverWait wait new WebDriverWait(driver, 10); List WebElement pageEles null; try { pageEles wait.until(ExpectedConditions.presenceOfAllElementsLocatedBy(By.cssSelector( .page-list a ))); } catch (Exception e) { driver.manage().timeouts().implicitlyWait(3, TimeUnit.SECONDS); pageEles driver.findElements(By.cssSelector( .page-list a )); } if(pageEles ! null !pageEles.isEmpty()) { Pattern pattern Pattern.compile( [0-9] ); for (int i 0; i pageEles.size(); i ) { //判断页码元素值是否为数字 String pageStr pageEles.get(i).getText().trim(); if( .equals(pageStr)) pageStr pageEles.get(i).getAttribute( innerHTML ).trim(); Matcher matcher pattern.matcher(pageStr); if(matcher.find()) { WebElement yearIssueEle driver.findElement(By.id( yearissue i)); List WebElement yearEles yearIssueEle.findElements(By.tagName( dl )); for (int j 0; j yearEles.size(); j ) { WebElement yearEle yearEles.get(j).findElement(By.tagName( dt )); if(year.equals(yearEle.getText().trim())) { //指定年 yearEle.click(); try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } wait new WebDriverWait(driver, 10); WebElement issEle wait.until(ExpectedConditions.presenceOfNestedElementLocatedBy(yearEles.get(j), By.tagName( dd ))); //WebElement issEle yearEles.get(j).findElement(By.tagName( dd String isShow issEle.getCssValue( display ); if( none .equals(isShow)) continue; List WebElement issueEles issEle.findElements(By.tagName( a )); for (int k 0; k issueEles.size(); k ) { WebElement issueEle issueEles.get(k); if(Integer.parseInt(issue) 10 !issue.contains( 0 )) issue 0 issue; if(issue.equals(issueEle.getAttribute( innerHTML ).replace( No. 
, ).trim())) { //指定期 issueEle.click(); try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } WebElement catalogContentEle driver.findElement(By.id( CataLogContent )); List WebElement elements catalogContentEle.findElements(By.cssSelector( dd )); String categoryName ; for (int a 0; a elements.size(); a ) { WebElement element elements.get(a); if( dt .equals(element.getTagName())) { String lanmu element.getAttribute( innerHTML ); if(!categoryName.equals(lanmu)) categoryName lanmu; }else { WebElement authorEle element.findElement(By.cssSelector( .author )); String author authorEle.getAttribute( innerHTML ).trim(); String[] authors author.split( ); WebElement pageEle element.findElement(By.cssSelector( .company )); String page pageEle.getAttribute( innerHTML ).trim(); String publisherID year - issue - page; // 根据页码获取Article 存储栏目 File downPath new File( C:\\\\Users\\\\wsh\\\\Downloads ); File[] listFiles downPath.listFiles(); for (File file : listFiles) { if(file.getName().contains(authors[0])) { try { FileUtils.copyFile(file, new File( D:\\\\webDriver\\\\ journalName \\\\ publisherID .pdf )); file.delete(); } catch (IOException e) { e.printStackTrace(); } } } } } } } } } } } driver.quit(); }else { WebElement yearIssueEle driver.findElement(By.id( yearissue 0 )); List WebElement yearEles yearIssueEle.findElements(By.tagName( dl )); for (int j 0; j yearEles.size(); j ) { WebElement yearEle yearEles.get(j).findElement(By.tagName( dt )); if(year.equals(yearEle.getText().trim())) { //指定年 yearEle.click(); try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } wait new WebDriverWait(driver, 10); WebElement issEle wait.until(ExpectedConditions.presenceOfNestedElementLocatedBy(yearEles.get(j), By.tagName( dd ))); //WebElement issEle yearEles.get(j).findElement(By.tagName( dd String isShow issEle.getCssValue( display ); if( none .equals(isShow)) continue; List WebElement issueEles issEle.findElements(By.tagName( a )); for 
(int k 0; k issueEles.size(); k ) { WebElement issueEle issueEles.get(k); if(Integer.parseInt(issue) 10 !issue.contains( 0 )) issue 0 issue; if(issue.equals(issueEle.getAttribute( innerHTML ).replace( No. , ).trim())) { //指定期 issueEle.click(); try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } WebElement catalogContentEle driver.findElement(By.id( CataLogContent )); List WebElement elements catalogContentEle.findElements(By.cssSelector( dd )); String categoryName ; for (int a 0; a elements.size(); a ) { WebElement element elements.get(a); if( dt .equals(element.getTagName())) { String lanmu element.getAttribute( innerHTML ); if(!categoryName.equals(lanmu)) categoryName lanmu; }else { WebElement authorEle element.findElement(By.cssSelector( .author )); String author authorEle.getAttribute( innerHTML ).trim(); String[] authors author.split( ); WebElement pageEle element.findElement(By.cssSelector( .company )); String page pageEle.getAttribute( innerHTML ).trim(); String publisherID year - issue - page; // 根据页码获取Article 存储栏目 File downPath new File( C:\\\\Users\\\\wsh\\\\Downloads ); File[] listFiles downPath.listFiles(); for (File file : listFiles) { if(file.getName().contains(authors[0])) { try { FileUtils.copyFile(file, new File( D:\\\\webDriver\\\\ journalName \\\\ publisherID .pdf )); file.delete(); } catch (IOException e) { e.printStackTrace(); } } } } } } } } } driver.quit(); } } public void updateAbstrat(EntityManager entityManager,Article article) { try { WebDriver driver null; ChromeDriverService service null; String chromeDriverPath D:\\\\webDriver\\\\chromedriver.exe ; HashMap String, Object chromePrefs new HashMap String, Object (); ChromeOptions chromeOptions new ChromeOptions(); try { chromePrefs.put( download.default_directory , D:\\\\webDriver ); File chromeDriverFile new File(chromeDriverPath); System.setProperty( webdriver.chrome.driver ,chromeDriverPath); chromeOptions.setExperimentalOption( prefs , chromePrefs); 
//设置为 headless 模式 必须 chromeOptions.addArguments( --headless ); chromeOptions.addArguments( --disable-gpu ); chromeOptions.addArguments( --no-sandbox ); // 禁止弹出拦截 chromeOptions.addArguments( --disable-popup-blocking ); // 禁止默认浏览器检查 chromeOptions.addArguments( no-default-browser-check ); chromeOptions.addArguments( about:histograms ); chromeOptions.addArguments( about:cache ); chromeOptions.addArguments( --start-maximized ); //创建一个 ChromeDriver 接口 service new ChromeDriverService.Builder().usingDriverExecutable(chromeDriverFile).usingAnyFreePort().build(); service.start(); } catch (IOException e1) { e1.printStackTrace(); } driver new ChromeDriver(chromeOptions); try { driver.get( https://kns.cnki.net/kcms/detail/detail.aspx?sfield FN dbCode CJFD filename article.getId() tableName CJFD2000 url ); Thread.sleep(1000); List WebElement rowEles driver.findElements(By.className( row )); for (WebElement rowEle : rowEles) { String content rowEle.getText(); if(content.contains( 摘要 )) { try { WebElement moreEle rowEle.findElement(By.id( ChDivSummaryMore )); if(moreEle.isDisplayed()) moreEle.click(); } catch (Exception e) { } String abstractInfo rowEle.findElement(By.id( ChDivSummary )).getAttribute( innerHTML ) .replace( span id \\ ChDivSummary\\ name \\ ChDivSummary\\ class \\ abstract-text\\ , ).replace( /span , ); article.setAbstractinfo(abstractInfo); article.setAbstractinfoCn(abstractInfo); article.setRemark1( cnki ); } } try { entityManager.getTransaction().begin(); entityManager.persist(article); entityManager.getTransaction().commit(); System.out.println(article.getId() :cnki ); } catch (Exception e) { entityManager.getTransaction().rollback(); } } catch (Exception e) { // TODO: handle exception } driver.manage().deleteAllCookies(); driver.quit(); } catch (Exception e) { // TODO: handle exception } }}
\"\" \"\" \"\" 点赞 \"\" \"\" 评论

本文链接: http://cnkint.immuno-online.com/view-713900.html

发布于 : 2021-03-24 阅读(0)
公司介绍
品牌分类
联络我们
服务热线:4000-520-616
(限工作日9:00-18:00)
QQ :1570468124
手机:18915418616