diff --git a/.gitignore b/.gitignore index 452a72e..fa35f18 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,8 @@ vendor/ # Go workspace file -go.work \ No newline at end of file +go.work + +# h5ai-dl generated files +urls.txt +downloads/ \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..4505d43 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Launch", + "type": "go", + "request": "launch", + "mode": "auto", + "program": "${fileDirname}" + } + ] +} \ No newline at end of file diff --git a/main.go b/main.go index c47dfac..8f97873 100644 --- a/main.go +++ b/main.go @@ -25,7 +25,7 @@ const ( FolderParentEntry = 2 ) -var baseUrl string +var hostUrl string var threads int64 = 0 @@ -35,6 +35,24 @@ var urlFileMtx sync.Mutex var wg sync.WaitGroup +func getDownloadPath(fileUrl string) (string, error) { + u, err := url.Parse(fileUrl) + if err != nil { + return "", err + } + + parts := []string{"downloads"} + + for _, str := range strings.Split(u.Path, "/") { + str = strings.TrimSpace(str) + if len(str) > 0 { + parts = append(parts, str) + } + } + + return strings.Join(parts, "/"), nil +} + func GetFileSize(name string) (int64, error) { stat, err := os.Stat(name) if err == nil { @@ -206,11 +224,7 @@ func ParseEntry(node *html.Node) { return } - if entryPath[0] == '/' { - entryPath = entryPath[1:] - } - - entryUrl := baseUrl + entryPath + entryUrl := hostUrl + entryPath if entryType == FolderEntry { crawlDirectoryAsync(entryUrl) } else { @@ -225,13 +239,11 @@ func writeUrl(fileUrl string) { urlFile.WriteString(fileUrl + "\n") } func downloadUrl(fileUrl string, downloadSize int64) { - fileName, err := url.QueryUnescape(fileUrl[25:]) + fileName, err := getDownloadPath(fileUrl) if err != nil { return } - fileName = "downloads/" + fileName - folder := filepath.Dir(fileName) if os.MkdirAll(folder, os.ModePerm) != nil { return @@ -268,12 +280,14 @@ func downloadUrl(fileUrl string, downloadSize int64) { for { n, err := resp.Body.Read(buffer) - if err == io.EOF { - return - } else if err != nil { - fmt.Printf("%s: %s\n", fileName, err.Error()) - os.Remove(fileName) - return + if err != nil { + if err == io.EOF { + return + } else { + fmt.Printf("%s: %s\n", fileName, err.Error()) + os.Remove(fileName) + return + } } if n != 4096 { @@ -378,11 +392,20 @@ func main() { return } - baseUrl = os.Args[1] - if baseUrl[len(baseUrl)-1] != '/' { - baseUrl = baseUrl + "/" + requestUrl, err := url.Parse(os.Args[1]) + if err != nil { + printUsage() + println("Error: 1st argument: " + err.Error()) + return + } + if requestUrl.Scheme != "http" && requestUrl.Scheme != "https" { + printUsage() + println("Error: 1st argument is not a http or https url!") + return } + hostUrl = requestUrl.Scheme + "://" + requestUrl.Host + writeUrlOnly, err = strconv.ParseBool(os.Args[2]) if err != nil { printUsage() @@ -397,7 +420,7 @@ func main() { } defer urlFile.Close() - crawlDirectory(baseUrl) + crawlDirectory(requestUrl.String()) time.Sleep(time.Second) wg.Wait()