Webingest (#37)
sammcj authored Oct 28, 2024
1 parent 7e66c3b commit 50ee813
Showing 2 changed files with 64 additions and 17 deletions.
51 changes: 38 additions & 13 deletions web/crawler.go
@@ -39,10 +39,20 @@ type Crawler struct {
 	options CrawlOptions
 	converter *md.Converter
 	excludePatterns []string
+	initialPath string // Store the initial URL path
+	singlePageMode bool // True if crawling a specific page
 }
 
-func NewCrawler(options CrawlOptions) *Crawler {
-	// Create a new converter with GitHub Flavoured Markdown support
+func NewCrawler(options CrawlOptions, startURL string) *Crawler {
+	parsedURL, err := url.Parse(startURL)
+	initialPath := "/"
+	singlePageMode := false
+
+	if err == nil && parsedURL.Path != "" && parsedURL.Path != "/" {
+		initialPath = strings.TrimSuffix(parsedURL.Path, "/")
+		singlePageMode = true
+	}
+	// Create a new converter with GitHub Flavored Markdown support
 	converter := md.NewConverter("", true, &md.Options{
 		// Configure the converter to handle common edge cases
 		StrongDelimiter: "**",
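Together with the new struct fields, the constructor now infers its mode from the start URL: any parseable URL with a non-root path puts the crawler into single-page mode and records the path with its trailing slash removed. Below is a minimal, self-contained sketch of that decision using the same net/url and strings calls; the detectSinglePage helper and the example URLs are illustrative and not part of the commit.

package main

import (
	"fmt"
	"net/url"
	"strings"
)

// detectSinglePage mirrors the check added to NewCrawler: a parseable URL with a
// non-root path enables single-page mode and stores the path without a trailing slash.
func detectSinglePage(startURL string) (initialPath string, singlePageMode bool) {
	initialPath = "/"
	if parsedURL, err := url.Parse(startURL); err == nil && parsedURL.Path != "" && parsedURL.Path != "/" {
		initialPath = strings.TrimSuffix(parsedURL.Path, "/")
		singlePageMode = true
	}
	return initialPath, singlePageMode
}

func main() {
	for _, u := range []string{"https://example.com/", "https://example.com/docs/install/"} {
		path, single := detectSinglePage(u)
		fmt.Printf("%-35s initialPath=%q singlePageMode=%v\n", u, path, single)
	}
}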
@@ -51,6 +61,7 @@ func NewCrawler(options CrawlOptions) *Crawler {
 		HeadingStyle: "atx",
 		HorizontalRule: "---",
 		CodeBlockStyle: "fenced",
+		BulletListMarker: "-",
 	})
 
 	// Use GitHub Flavored Markdown plugins
@@ -61,9 +72,11 @@ func NewCrawler(options CrawlOptions) *Crawler {
 	converter.Remove("script", "style", "iframe", "noscript") // Remove unwanted elements
 
 	return &Crawler{
-		visited:   make(map[string]bool),
-		options:   options,
-		converter: converter,
+		visited:        make(map[string]bool),
+		options:        options,
+		converter:      converter,
+		initialPath:    initialPath,
+		singlePageMode: singlePageMode,
 	}
 }
 
@@ -201,21 +214,33 @@ func (c *Crawler) resolveURL(base *url.URL, ref string) string {
 }
 
 func (c *Crawler) isAllowed(urlStr string) bool {
-	if len(c.options.AllowedDomains) == 0 {
-		return true
-	}
-
 	parsedURL, err := url.Parse(urlStr)
 	if err != nil {
 		return false
 	}
 
-	for _, domain := range c.options.AllowedDomains {
-		if strings.Contains(parsedURL.Host, domain) {
-			return true
+	// Check domain restrictions if any
+	if len(c.options.AllowedDomains) > 0 {
+		domainAllowed := false
+		for _, domain := range c.options.AllowedDomains {
+			if strings.Contains(parsedURL.Host, domain) {
+				domainAllowed = true
+				break
+			}
 		}
+		if !domainAllowed {
+			return false
+		}
 	}
-	return false
+
+	// If we're in single page mode, only allow the exact same path
+	if c.singlePageMode {
+		currentPath := strings.TrimSuffix(parsedURL.Path, "/")
+		// Only allow the exact same path or same path with a fragment
+		return currentPath == c.initialPath
+	}
+
+	return true
 }
 
 func (c *Crawler) Crawl(startURL string) ([]*WebPage, error) {
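The rewritten isAllowed applies two independent gates: an optional AllowedDomains substring match against the link's host, and, in single-page mode, an exact path comparison with trailing slashes stripped. The following self-contained sketch reproduces that filtering for a crawler assumed to be in single-page mode for /docs/install; the allowed helper and the sample links are illustrative only.

package main

import (
	"fmt"
	"net/url"
	"strings"
)

// allowed reproduces the two checks from the updated isAllowed: the host must match an
// allowed domain (when any are configured), and in single-page mode the path must equal
// the initial path once trailing slashes are removed.
func allowed(rawURL string, allowedDomains []string, singlePageMode bool, initialPath string) bool {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return false
	}
	if len(allowedDomains) > 0 {
		domainAllowed := false
		for _, domain := range allowedDomains {
			if strings.Contains(parsedURL.Host, domain) {
				domainAllowed = true
				break
			}
		}
		if !domainAllowed {
			return false
		}
	}
	if singlePageMode {
		return strings.TrimSuffix(parsedURL.Path, "/") == initialPath
	}
	return true
}

func main() {
	links := []string{
		"https://example.com/docs/install/", // same page, trailing slash: allowed
		"https://example.com/docs/other",    // same domain, different path: rejected
		"https://other.test/docs/install",   // domain not in the allowlist: rejected
	}
	for _, link := range links {
		fmt.Println(link, allowed(link, []string{"example.com"}, true, "/docs/install"))
	}
}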
30 changes: 26 additions & 4 deletions web/integration.go
@@ -27,8 +27,8 @@ func ProcessWebURL(urlStr string, options CrawlOptions, excludePatterns []string
 		return nil, fmt.Errorf("URL must start with http:// or https://")
 	}
 
-	// Initialize crawler
-	crawler := NewCrawler(options)
+	// Initialise crawler with the start URL
+	crawler := NewCrawler(options, urlStr)
 	crawler.SetExcludePatterns(excludePatterns)
 
 	// Perform crawl
@@ -52,8 +52,30 @@ func ProcessWebURL(urlStr string, options CrawlOptions, excludePatterns []string
 		})
 	}
 
-	// Generate tree representation
-	treeString := generateWebTree(pages)
+	// Generate tree representation, but only if we have more than one page
+	var treeString string
+	if len(files) > 1 {
+		treeString = generateWebTree(pages)
+	} else if len(files) == 1 {
+		treeString = fmt.Sprintf("Web Page: %s", files[0].Path)
+	}
+
+	// If we're crawling a specific page, only return that page's content
+	if parsedURL.Path != "/" && parsedURL.Path != "" {
+		for _, file := range files {
+			fileURL, err := url.Parse(file.Path)
+			if err != nil {
+				continue
+			}
+			// Find the exact matching path (ignoring trailing slashes)
+			if strings.TrimSuffix(fileURL.Path, "/") == strings.TrimSuffix(parsedURL.Path, "/") {
+				return &CrawlResult{
+					TreeString: fmt.Sprintf("Web Page: %s", file.Path),
+					Files: []filesystem.FileInfo{file},
+				}, nil
+			}
+		}
+	}
 
 	return &CrawlResult{
 		TreeString: treeString,
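On the integration side, when the requested URL has a non-root path the result is narrowed to the single crawled page whose path matches it, ignoring trailing slashes. Here is a compact, self-contained sketch of that selection step, with pages represented by their URL strings; pickSinglePage and the sample data are illustrative and not the commit's API.

package main

import (
	"fmt"
	"net/url"
	"strings"
)

// pickSinglePage returns the first crawled URL whose path matches the requested URL's
// path, ignoring trailing slashes, mirroring the exact-match loop in ProcessWebURL.
func pickSinglePage(requestURL string, crawledPaths []string) (string, bool) {
	parsedURL, err := url.Parse(requestURL)
	if err != nil || parsedURL.Path == "" || parsedURL.Path == "/" {
		return "", false
	}
	want := strings.TrimSuffix(parsedURL.Path, "/")
	for _, p := range crawledPaths {
		fileURL, err := url.Parse(p)
		if err != nil {
			continue
		}
		if strings.TrimSuffix(fileURL.Path, "/") == want {
			return p, true
		}
	}
	return "", false
}

func main() {
	crawled := []string{
		"https://example.com/docs/",
		"https://example.com/docs/install/",
	}
	if page, ok := pickSinglePage("https://example.com/docs/install", crawled); ok {
		fmt.Println("Web Page:", page)
	}
}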