{"id":"https://openalex.org/W4400612131","doi":"https://doi.org/10.48550/arxiv.2407.08739","title":"MAVIS: Mathematical Visual Instruction Tuning with an Automatic Data Engine","display_name":"MAVIS: Mathematical Visual Instruction Tuning with an Automatic Data Engine","publication_year":2024,"publication_date":"2024-07-11","ids":{"openalex":"https://openalex.org/W4400612131","doi":"https://doi.org/10.48550/arxiv.2407.08739"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2407.08739","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.08739","pdf_url":"https://arxiv.org/pdf/2407.08739","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2407.08739","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5086183847","display_name":"Renrui Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Renrui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033323315","display_name":"Xinyu Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Xinyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102634569","display_name":"Dongzhi Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Dongzhi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Guo, Ziyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Ziyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Li, Shicheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Shicheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100444205","display_name":"Yichi Zhang","orcid":"https://orcid.org/0009-0005-1156-5538"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yichi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113394706","display_name":"Chengzhuo Tong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tong, Chengzhuo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100440970","display_name":"Jiaming Liu","orcid":"https://orcid.org/0000-0002-1042-4443"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Jiaming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041431476","display_name":"Aojun Zhou","orcid":"https://orcid.org/0000-0002-4742-8624"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Aojun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104163654","display_name":"Bin Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Bin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013030532","display_name":"Shanghang Zhang","orcid":"https://orcid.org/0000-0003-4047-3526"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Shanghang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073018274","display_name":"Peng Gao","orcid":"https://orcid.org/0009-0000-0848-9814"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Peng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Li, Chunyuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Chunyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100732450","display_name":"Hongsheng Li","orcid":"https://orcid.org/0000-0002-2664-7975"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Hongsheng","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":14,"corresponding_author_ids":["https://openalex.org/A5086183847"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10130","display_name":"Mathematics Education and Teaching Techniques","score":0.4291999936103821,"subfield":{"id":"https://openalex.org/subfields/3304","display_name":"Education"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10130","display_name":"Mathematics Education and Teaching Techniques","score":0.4291999936103821,"subfield":{"id":"https://openalex.org/subfields/3304","display_name":"Education"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5697439312934875},{"id":"https://openalex.org/keywords/mathematics-education","display_name":"Mathematics education","score":0.35128480195999146},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.331213116645813},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.20699062943458557}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5697439312934875},{"id":"https://openalex.org/C145420912","wikidata":"https://www.wikidata.org/wiki/Q853077","display_name":"Mathematics education","level":1,"score":0.35128480195999146},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.331213116645813},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.20699062943458557}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2407.08739","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.08739","pdf_url":"https://arxiv.org/pdf/2407.08739","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2407.08739","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2407.08739","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2407.08739","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.08739","pdf_url":"https://arxiv.org/pdf/2407.08739","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4400612131.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W2382290278","https://openalex.org/W4395014643"],"abstract_inverted_index":{"The":[0],"mathematical":[1,81,185],"capabilities":[2,218],"of":[3,19,94,219],"Multi-modal":[4],"Large":[5],"Language":[6],"Models":[7],"(MLLMs)":[8],"remain":[9],"under-explored":[10],"with":[11,44,129,171],"three":[12],"areas":[13],"to":[14,51,55,78,90,146,167,191,214],"be":[15,91,232],"improved:":[16],"visual":[17,82,126,160],"encoding":[18],"math":[20,127],"diagrams,":[21],"diagram-language":[22],"alignment,":[23],"and":[24,39,53,108,123,132,200,229],"chain-of-thought":[25],"(CoT)":[26],"reasoning.":[27],"This":[28],"draws":[29],"forth":[30],"an":[31,35,74],"urgent":[32],"demand":[33],"for":[34,71,137,157,196],"effective":[36],"training":[37,138],"paradigm":[38],"a":[40,65,148,172,178],"large-scale,":[41],"comprehensive":[42],"dataset":[43],"detailed":[45],"CoT":[46,109,130,217],"rationales,":[47],"which":[48],"is":[49],"challenging":[50],"collect":[52],"costly":[54],"annotate":[56],"manually.":[57],"To":[58],"tackle":[59],"this":[60,113],"issue,":[61],"we":[62,115,143,163,188,208],"propose":[63,133],"MAVIS,":[64],"MAthematical":[66],"VISual":[67],"instruction":[68,194],"tuning":[69,195],"pipeline":[70],"MLLMs,":[72],"featuring":[73],"automatic":[75],"data":[76,87,230],"engine":[77],"efficiently":[79],"create":[80],"datasets.":[83],"We":[84],"design":[85],"the":[86,103,169,193,202,216],"generation":[88],"process":[89],"entirely":[92],"independent":[93],"human":[95],"intervention":[96],"or":[97],"GPT":[98],"API":[99],"usage,":[100],"while":[101],"ensuring":[102],"diagram-caption":[104,121],"correspondence,":[105],"question-answer":[106],"correctness,":[107],"reasoning":[110,226],"quality.":[111],"With":[112],"approach,":[114],"curate":[116],"two":[117],"datasets,":[118],"MAVIS-Caption":[119,145,166],"(558K":[120],"pairs)":[122],"MAVIS-Instruct":[124,190],"(834K":[125],"problems":[128],"rationales),":[131],"four":[134],"progressive":[135],"stages":[136],"MLLMs":[139],"from":[140],"scratch.":[141],"First,":[142],"utilize":[144],"fine-tune":[147],"math-specific":[149],"vision":[150],"encoder":[151],"(CLIP-Math)":[152],"through":[153],"contrastive":[154],"learning,":[155],"tailored":[156],"improved":[158],"diagram":[159],"encoding.":[161],"Second,":[162],"also":[164],"leverage":[165],"align":[168],"CLIP-Math":[170],"large":[173],"language":[174],"model":[175,204],"(LLM)":[176],"by":[177],"projection":[179],"layer,":[180],"enhancing":[181],"vision-language":[182],"alignment":[183],"in":[184],"domains.":[186],"Third,":[187],"adopt":[189],"perform":[192],"robust":[197],"problem-solving":[198],"skills,":[199],"term":[201],"resulting":[203],"as":[205],"MAVIS-7B.":[206],"Fourth,":[207],"apply":[209],"Direct":[210],"Preference":[211],"Optimization":[212],"(DPO)":[213],"enhance":[215],"our":[220],"model,":[221],"further":[222],"refining":[223],"its":[224],"step-wise":[225],"performance.":[227],"Code":[228],"will":[231],"released":[233],"at":[234],"https://github.com/ZrrSkywalker/MAVIS":[235]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2024-07-14T00:00:00"}
