[AHK#58] 擷取豆瓣電影網的超簡單AutoHotkey爬蟲腳本程式

使用 AutoHotkey V2 撰寫爬蟲腳本程式，由豆瓣網擷取網頁內容，解析出需要的資料後產生Markdown檔案到Obsidian的儲存庫資料夾裡。

1. movie.ahk 腳本內容


#Requires AutoHotkey v2.0
#SingleInstance Force

; When true, the script ignores the command line and uses the hard-coded
; sample Douban ID and output folder below.
DEBUG := true

; Resolve the two inputs used by the rest of the script:
;   sID        - Douban subject ID of the movie
;   sOutputDir - folder that receives the generated Markdown file
if (DEBUG) {
  sID := "1294194"
  sOutputDir := "z:\test\obsidian\collections\Movies\"
} else if (A_Args.Length < 2) {
  ; Both arguments are mandatory. Checking only for zero arguments would let
  ; a single-argument call fall through and fail on A_Args[2] below.
  MsgBox("需要兩個參數`n範例:movie.ahk2 豆瓣ID 輸出資料夾")
  ExitApp
} else {
  sID := A_Args[1]
  sOutputDir := A_Args[2]
}

;;f1::
  ; Downloads the Douban page for sID, extracts the poster URL, the title and
  ; the text of the "info" element, and writes them as a Markdown note named
  ; after the title into sOutputDir.

  ; NOTE(review): "UTF-8" makes FileAppend write a byte order mark on new
  ; files; switch to "UTF-8-RAW" if the BOM interferes with front matter
  ; parsing - confirm against Obsidian.
  FileEncoding("UTF-8")
  
  url := "https://movie.douban.com/subject/" . sID . "/"
  
  ;;httpClient := ComObjCreate("WinHttp.WinHttpRequest.5.1")  ;; AHK 1.x
  httpClient := ComObject("WinHttp.WinHttpRequest.5.1")
  ; NOTE(review): the page is requested with POST and no body; GET would be
  ; the conventional method for a plain page fetch - left unchanged because
  ; the server's behaviour for each method was not verified.
  httpClient.Open("POST", url, false)
  ;httpClient.SetRequestHeader("User-Agent", User-Agent)
  ;httpClient.SetRequestHeader("Content-Type", Content-Type)
  ;httpClient.SetRequestHeader("Cookie", Cookie)

  httpClient.SetRequestHeader("Content-Type", "application/x-www-form-urlencoded")
  httpClient.Send()
  httpClient.WaitForResponse()
  ; Stop early on an error response instead of trying to parse an error page.
  if (httpClient.Status != 200) {
    MsgBox("HTTP request failed: " . httpClient.Status . " " . httpClient.StatusText)
    return
  }
  Result := httpClient.ResponseText

  ;;html := ComObjCreate("HTMLFile")  ;; AHK 1.x
  html := ComObject("HTMLFile")
  html.write(Result)
  mainpic := html.getElementById("mainpic")
  if (!mainpic) {
    MsgBox("Cannot find mainpic div")
    return
  }

  ; Isolate the first <img ...> tag inside the poster container.
  imgHtml := mainpic.innerHTML
  if (!RegExMatch(imgHtml, "i)<img\s[^>]*>", &mImg)) {
    MsgBox("Cannot find poster image")
    return
  }
  imgTag := mImg[0]

  ; Attribute values may come back quoted or bare depending on their content,
  ; so both forms are handled by the helper.
  sPic := ExtractHtmlAttr(imgTag, "src")
  ;;MsgBox(sPic)
  sTitle := ExtractHtmlAttr(imgTag, "alt")
  ;;MsgBox(sTitle)
  if (sTitle == "") {
    MsgBox("Cannot find movie title")
    return
  }

  divInfo := html.getElementById("info")
  if (divInfo) {
    infoText := divInfo.innerText
    ;;MsgBox(infoText)
    ; Turn "key: value" lines into "key:: value" lines.
    infoText := StrReplace(infoText, ": ", ":: ")

    ; Make sure the folder ends with a separator before appending the name.
    sDir := sOutputDir
    lastChar := SubStr(sDir, -1)
    if (sDir != "" && lastChar != "\" && lastChar != "/") {
      sDir .= "\"
    }
    ; Replace characters that Windows does not allow in file names.
    sFileName := RegExReplace(sTitle, '[\\/:*?"<>|]', "_")
    sFile := sDir . sFileName . ".md"
    if FileExist(sFile) {
      FileDelete(sFile)
    }
    /*text := Format("---`r`ntemplate-output: Movies`r`ntags: movie`r`ntitle: {1}" .
      "`r`n照片: {2}`r`n豆瓣ID: {3}`r`n---`r`n# {4}" . 
      "`r`n`r`n![|300]({5})`r`n`r`n{6}", sTitle, sPic, sID, sTitle, sPic, text)
    */
    ; Note template: front matter, heading, poster image, then the info text.
    sFormat := "
    (
      ---
      template-output: Movies
      tags: movie
      title: {1}
      照片: {2}
      豆瓣ID: {3}
      ---
      # {4}

      ![|300]({5})

      {6}

    )"

    sContent := Format(sFormat, sTitle, sPic, sID, sTitle, sPic, infoText)
    FileAppend(sContent, sFile)
  } else {
    MsgBox("Cannot find info div")
  }
  return

  ; Returns the value of attribute `name` from the HTML tag text `tag`, or an
  ; empty string when the attribute is absent. Accepts double-quoted values
  ; and bare values that end at whitespace or ">". The leading \s keeps
  ; "src" from matching inside names such as "data-src".
  ExtractHtmlAttr(tag, name) {
    if (RegExMatch(tag, 'i)\s' . name . '=(?:"([^"]*)"|([^\s>]*))', &m)) {
      ; Only one of the two alternatives captures; the other is empty.
      return m[1] . m[2]
    }
    return ""
  }

2. 相關鏈接

3. 教學影片

##

您可能也會有興趣的類似文章

您可能也會喜歡…

發佈留言

發佈留言必須填寫的電子郵件地址不會公開。 必填欄位標示為 *