Webingest (#37)
sammcj authored Oct 28, 2024
1 parent 7e66c3b commit 50ee813
Showing 2 changed files with 64 additions and 17 deletions.
51 changes: 38 additions & 13 deletions web/crawler.go
@@ -39,10 +39,20 @@ type Crawler struct {
 	options CrawlOptions
 	converter *md.Converter
 	excludePatterns []string
+	initialPath string // Store the initial URL path
+	singlePageMode bool // True if crawling a specific page
 }
 
-func NewCrawler(options CrawlOptions) *Crawler {
-	// Create a new converter with GitHub Flavoured Markdown support
+func NewCrawler(options CrawlOptions, startURL string) *Crawler {
+	parsedURL, err := url.Parse(startURL)
+	initialPath := "/"
+	singlePageMode := false
+
+	if err == nil && parsedURL.Path != "" && parsedURL.Path != "/" {
+		initialPath = strings.TrimSuffix(parsedURL.Path, "/")
+		singlePageMode = true
+	}
+	// Create a new converter with GitHub Flavored Markdown support
 	converter := md.NewConverter("", true, &md.Options{
 		// Configure the converter to handle common edge cases
 		StrongDelimiter: "**",
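Together with the new struct fields, the constructor now infers its mode from the start URL: any parseable URL with a non-root path puts the crawler into single-page mode and records the path with its trailing slash removed. Below is a minimal, self-contained sketch of that decision using the same net/url and strings calls; the detectSinglePage helper and the example URLs are illustrative and not part of the commit.

package main

import (
	"fmt"
	"net/url"
	"strings"
)

// detectSinglePage mirrors the check added to NewCrawler: a parseable URL with a
// non-root path enables single-page mode and stores the path without a trailing slash.
func detectSinglePage(startURL string) (initialPath string, singlePageMode bool) {
	initialPath = "/"
	if parsedURL, err := url.Parse(startURL); err == nil && parsedURL.Path != "" && parsedURL.Path != "/" {
		initialPath = strings.TrimSuffix(parsedURL.Path, "/")
		singlePageMode = true
	}
	return initialPath, singlePageMode
}

func main() {
	for _, u := range []string{"https://example.com/", "https://example.com/docs/install/"} {
		path, single := detectSinglePage(u)
		fmt.Printf("%-35s initialPath=%q singlePageMode=%v\n", u, path, single)
	}
}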
@@ -51,6 +61,7 @@ func NewCrawler(options CrawlOptions) *Crawler {
 		HeadingStyle: "atx",
 		HorizontalRule: "---",
 		CodeBlockStyle: "fenced",
+		BulletListMarker: "-",
 	})
 
 	// Use GitHub Flavored Markdown plugins
@@ -61,9 +72,11 @@ func NewCrawler(options CrawlOptions) *Crawler {
 	converter.Remove("script", "style", "iframe", "noscript") // Remove unwanted elements
 
 	return &Crawler{
-		visited:   make(map[string]bool),
-		options:   options,
-		converter: converter,
+		visited:        make(map[string]bool),
+		options:        options,
+		converter:      converter,
+		initialPath:    initialPath,
+		singlePageMode: singlePageMode,
 	}
 }
 
@@ -201,21 +214,33 @@ func (c *Crawler) resolveURL(base *url.URL, ref string) string {
 }
 
 func (c *Crawler) isAllowed(urlStr string) bool {
-	if len(c.options.AllowedDomains) == 0 {
-		return true
-	}
-
 	parsedURL, err := url.Parse(urlStr)
 	if err != nil {
 		return false
 	}
 
-	for _, domain := range c.options.AllowedDomains {
-		if strings.Contains(parsedURL.Host, domain) {
-			return true
+	// Check domain restrictions if any
+	if len(c.options.AllowedDomains) > 0 {
+		domainAllowed := false
+		for _, domain := range c.options.AllowedDomains {
+			if strings.Contains(parsedURL.Host, domain) {
+				domainAllowed = true
+				break
+			}
 		}
+		if !domainAllowed {
+			return false
+		}
 	}
-	return false
+
+	// If we're in single page mode, only allow the exact same path
+	if c.singlePageMode {
+		currentPath := strings.TrimSuffix(parsedURL.Path, "/")
+		// Only allow the exact same path or same path with a fragment
+		return currentPath == c.initialPath
+	}
+
+	return true
 }
 
 func (c *Crawler) Crawl(startURL string) ([]*WebPage, error) {
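The rewritten isAllowed applies two independent gates: an optional AllowedDomains substring match against the link's host, and, in single-page mode, an exact path comparison with trailing slashes stripped. The following self-contained sketch reproduces that filtering for a crawler assumed to be in single-page mode for /docs/install; the allowed helper and the sample links are illustrative only.

package main

import (
	"fmt"
	"net/url"
	"strings"
)

// allowed reproduces the two checks from the updated isAllowed: the host must match an
// allowed domain (when any are configured), and in single-page mode the path must equal
// the initial path once trailing slashes are removed.
func allowed(rawURL string, allowedDomains []string, singlePageMode bool, initialPath string) bool {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return false
	}
	if len(allowedDomains) > 0 {
		domainAllowed := false
		for _, domain := range allowedDomains {
			if strings.Contains(parsedURL.Host, domain) {
				domainAllowed = true
				break
			}
		}
		if !domainAllowed {
			return false
		}
	}
	if singlePageMode {
		return strings.TrimSuffix(parsedURL.Path, "/") == initialPath
	}
	return true
}

func main() {
	links := []string{
		"https://example.com/docs/install/", // same page, trailing slash: allowed
		"https://example.com/docs/other",    // same domain, different path: rejected
		"https://other.test/docs/install",   // domain not in the allowlist: rejected
	}
	for _, link := range links {
		fmt.Println(link, allowed(link, []string{"example.com"}, true, "/docs/install"))
	}
}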
30 changes: 26 additions & 4 deletions web/integration.go
@@ -27,8 +27,8 @@ func ProcessWebURL(urlStr string, options CrawlOptions, excludePatterns []string
 		return nil, fmt.Errorf("URL must start with http:// or https://")
 	}
 
-	// Initialize crawler
-	crawler := NewCrawler(options)
+	// Initialise crawler with the start URL
+	crawler := NewCrawler(options, urlStr)
 	crawler.SetExcludePatterns(excludePatterns)
 
 	// Perform crawl
@@ -52,8 +52,30 @@ func ProcessWebURL(urlStr string, options CrawlOptions, excludePatterns []string
 		})
 	}
 
-	// Generate tree representation
-	treeString := generateWebTree(pages)
+	// Generate tree representation, but only if we have more than one page
+	var treeString string
+	if len(files) > 1 {
+		treeString = generateWebTree(pages)
+	} else if len(files) == 1 {
+		treeString = fmt.Sprintf("Web Page: %s", files[0].Path)
+	}
+
+	// If we're crawling a specific page, only return that page's content
+	if parsedURL.Path != "/" && parsedURL.Path != "" {
+		for _, file := range files {
+			fileURL, err := url.Parse(file.Path)
+			if err != nil {
+				continue
+			}
+			// Find the exact matching path (ignoring trailing slashes)
+			if strings.TrimSuffix(fileURL.Path, "/") == strings.TrimSuffix(parsedURL.Path, "/") {
+				return &CrawlResult{
+					TreeString: fmt.Sprintf("Web Page: %s", file.Path),
+					Files: []filesystem.FileInfo{file},
+				}, nil
+			}
+		}
+	}
 
 	return &CrawlResult{
 		TreeString: treeString,
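On the integration side, when the requested URL has a non-root path the result is narrowed to the single crawled page whose path matches it, ignoring trailing slashes. Here is a compact, self-contained sketch of that selection step, with pages represented by their URL strings; pickSinglePage and the sample data are illustrative and not the commit's API.

package main

import (
	"fmt"
	"net/url"
	"strings"
)

// pickSinglePage returns the first crawled URL whose path matches the requested URL's
// path, ignoring trailing slashes, mirroring the exact-match loop in ProcessWebURL.
func pickSinglePage(requestURL string, crawledPaths []string) (string, bool) {
	parsedURL, err := url.Parse(requestURL)
	if err != nil || parsedURL.Path == "" || parsedURL.Path == "/" {
		return "", false
	}
	want := strings.TrimSuffix(parsedURL.Path, "/")
	for _, p := range crawledPaths {
		fileURL, err := url.Parse(p)
		if err != nil {
			continue
		}
		if strings.TrimSuffix(fileURL.Path, "/") == want {
			return p, true
		}
	}
	return "", false
}

func main() {
	crawled := []string{
		"https://example.com/docs/",
		"https://example.com/docs/install/",
	}
	if page, ok := pickSinglePage("https://example.com/docs/install", crawled); ok {
		fmt.Println("Web Page:", page)
	}
}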