@article{fukushima_neocognitron_1980,
title = {Neocognitron: a self organizing neural network model for a mechanism of pattern recognition unaffected by shift in position},
volume = {36},
issn = {0340-1200},
shorttitle = {Neocognitron},
doi = {10.1007/BF00344251},
abstract = {A neural network model for a mechanism of visual pattern recognition is proposed in this paper. The network is self-organized by "learning without a teacher", and acquires an ability to recognize stimulus patterns based on the geometrical similarity (Gestalt) of their shapes without affected by their positions. This network is given a nickname "neocognitron". After completion of self-organization, the network has a structure similar to the hierarchy model of the visual nervous system proposed by Hubel and Wiesel. The network consists of an input layer (photoreceptor array) followed by a cascade connection of a number of modular structures, each of which is composed of two layers of cells connected in a cascade. The first layer of each module consists of "S-cells", which show characteristics similar to simple cells or lower order hypercomplex cells, and the second layer consists of "C-cells" similar to complex cells or higher order hypercomplex cells. The afferent synapses to each S-cell have plasticity and are modifiable. The network has an ability of unsupervised learning: We do not need any "teacher" during the process of self-organization, and it is only needed to present a set of stimulus patterns repeatedly to the input layer of the network. The network has been simulated on a digital computer. After repetitive presentation of a set of stimulus patterns, each stimulus pattern has become to elicit an output only from one of the C-cells of the last layer, and conversely, this C-cell has become selectively responsive only to that stimulus pattern. That is, none of the C-cells of the last layer responds to more than one stimulus pattern. The response of the C-cells of the last layer is not affected by the pattern's position at all. Neither is it affected by a small change in shape nor in size of the stimulus pattern.},
language = {eng},
number = {4},
journal = {Biological Cybernetics},
author = {Fukushima, Kunihiko},
year = {1980},
pmid = {7370364},
keywords = {Cognition, Computers, Form Perception, Learning, Models, Neurological, Nerve Net, Nervous System Physiological Phenomena, Pattern Recognition, Visual},
pages = {193--202},
}
@article{lecun_gradient-based_1998,
title = {Gradient-based learning applied to document recognition},
volume = {86},
issn = {0018-9219},
url = {http://ieeexplore.ieee.org/document/726791/},
doi = {10.1109/5.726791},
abstract = {Multilayer neural networks trained with the back-propagation algorithm constitute the best example of a successful gradient based learning technique. Given an appropriate network architecture, gradient-based learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns, such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional neural networks, which are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques. Real-life document recognition systems are composed of multiple modules including field extraction, segmentation recognition, and language modeling. A new learning paradigm, called graph transformer networks (GTN), allows such multimodule systems to be trained globally using gradient-based methods so as to minimize an overall performance measure. Two systems for online handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of graph transformer networks. A graph transformer network for reading a bank cheque is also described. It uses convolutional neural network character recognizers combined with global training techniques to provide record accuracy on business and personal cheques. It is deployed commercially and reads several million cheques per day.},
number = {11},
urldate = {2024-03-02},
journal = {Proceedings of the IEEE},
author = {LeCun, Yann and Bottou, Léon and Bengio, Yoshua and Haffner, Patrick},
month = nov,
year = {1998},
pages = {2278--2324},
}
@article{krizhevsky_imagenet_2012,
title = {{ImageNet} {Classification} with {Deep} {Convolutional} {Neural} {Networks}},
volume = {25},
doi = {10.1145/3065386},
abstract = {We trained a large, deep convolutional neural network to classify the 1.2 million high-resolution images in the ImageNet LSVRC-2010 contest into the 1000 dif-ferent classes. On the test data, we achieved top-1 and top-5 error rates of 37.5\% and 17.0\% which is considerably better than the previous state-of-the-art. The neural network, which has 60 million parameters and 650,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and three fully-connected layers with a final 1000-way softmax. To make train-ing faster, we used non-saturating neurons and a very efficient GPU implemen-tation of the convolution operation. To reduce overfitting in the fully-connected layers we employed a recently-developed regularization method called "dropout" that proved to be very effective. We also entered a variant of this model in the ILSVRC-2012 competition and achieved a winning top-5 test error rate of 15.3\%, compared to 26.2\% achieved by the second-best entry.},
journal = {Advances in Neural Information Processing Systems},
author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey},
month = jan,
year = {2012},
}
@misc{simonyan_very_2014,
title = {Very {Deep} {Convolutional} {Networks} for {Large}-{Scale} {Image} {Recognition}},
url = {http://arxiv.org/abs/1409.1556},
doi = {10.48550/arXiv.1409.1556},
abstract = {In this work we investigate the effect of the convolutional network depth on its accuracy in the large-scale image recognition setting. Our main contribution is a thorough evaluation of networks of increasing depth, which shows that a significant improvement on the prior-art configurations can be achieved by pushing the depth to 16-19 weight layers. These findings were the basis of our ImageNet Challenge 2014 submission, where our team secured the first and the second places in the localisation and classification tracks respectively.},
publisher = {arXiv},
author = {Simonyan, Karen and Zisserman, Andrew},
month = sep,
year = {2014},
note = {arXiv:1409.1556 [cs]},
}
@inproceedings{szegedy_going_2015,
title = {Going deeper with convolutions},
booktitle = {2015 {IEEE} {Conference} on {Computer} {Vision} and {Pattern} {Recognition} ({CVPR})},
doi = {10.1109/CVPR.2015.7298594},
author = {Szegedy, Christian and Liu, Wei and Jia, Yangqing and Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir and Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich, Andrew},
month = jun,
year = {2015},
pages = {1--9},
}
@misc{sabour_dynamic_2017,
title = {Dynamic {Routing} {Between} {Capsules}},
url = {http://arxiv.org/abs/1710.09829},
doi = {10.48550/arXiv.1710.09829},
abstract = {A capsule is a group of neurons whose activity vector represents the instantiation parameters of a specific type of entity such as an object or an object part. We use the length of the activity vector to represent the probability that the entity exists and its orientation to represent the instantiation parameters. Active capsules at one level make predictions, via transformation matrices, for the instantiation parameters of higher-level capsules. When multiple predictions agree, a higher level capsule becomes active. We show that a discriminatively trained, multi-layer capsule system achieves state-of-the-art performance on MNIST and is considerably better than a convolutional net at recognizing highly overlapping digits. To achieve these results we use an iterative routing-by-agreement mechanism: A lower-level capsule prefers to send its output to higher level capsules whose activity vectors have a big scalar product with the prediction coming from the lower-level capsule.},
urldate = {2024-03-02},
publisher = {arXiv},
author = {Sabour, Sara and Frosst, Nicholas and Hinton, Geoffrey E.},
month = nov,
year = {2017},
note = {arXiv:1710.09829 [cs]},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
}
@misc{radford_learning_2021,
title = {Learning {Transferable} {Visual} {Models} {From} {Natural} {Language} {Supervision}},
url = {http://arxiv.org/abs/2103.00020},
doi = {10.48550/arXiv.2103.00020},
abstract = {State-of-the-art computer vision systems are trained to predict a fixed set of predetermined object categories. This restricted form of supervision limits their generality and usability since additional labeled data is needed to specify any other visual concept. Learning directly from raw text about images is a promising alternative which leverages a much broader source of supervision. We demonstrate that the simple pre-training task of predicting which caption goes with which image is an efficient and scalable way to learn SOTA image representations from scratch on a dataset of 400 million (image, text) pairs collected from the internet. After pre-training, natural language is used to reference learned visual concepts (or describe new ones) enabling zero-shot transfer of the model to downstream tasks. We study the performance of this approach by benchmarking on over 30 different existing computer vision datasets, spanning tasks such as OCR, action recognition in videos, geo-localization, and many types of fine-grained object classification. The model transfers non-trivially to most tasks and is often competitive with a fully supervised baseline without the need for any dataset specific training. For instance, we match the accuracy of the original ResNet-50 on ImageNet zero-shot without needing to use any of the 1.28 million training examples it was trained on. We release our code and pre-trained model weights at https://github.com/OpenAI/CLIP.},
urldate = {2024-03-02},
publisher = {arXiv},
author = {Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and Krueger, Gretchen and Sutskever, Ilya},
month = feb,
year = {2021},
note = {arXiv:2103.00020 [cs]},
keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning},
}
@inproceedings{redmon_you_2016,
title = {You {Only} {Look} {Once}: {Unified}, {Real}-{Time} {Object} {Detection}},
shorttitle = {You {Only} {Look} {Once}},
url = {http://ieeexplore.ieee.org/document/7780460/},
doi = {10.1109/CVPR.2016.91},
abstract = {We present YOLO, a new approach to object detection. Prior work on object detection repurposes classifiers to perform detection. Instead, we frame object detection as a regression problem to spatially separated bounding boxes and associated class probabilities. A single neural network predicts bounding boxes and class probabilities directly from full images in one evaluation. Since the whole detection pipeline is a single network, it can be optimized end-to-end directly on detection performance. Our unified architecture is extremely fast. Our base YOLO model processes images in real-time at 45 frames per second. A smaller version of the network, Fast YOLO, processes an astounding 155 frames per second while still achieving double the mAP of other real-time detectors. Compared to state-of-the-art detection systems, YOLO makes more localization errors but is less likely to predict false positives on background. Finally, YOLO learns very general representations of objects. It outperforms other detection methods, including DPM and R-CNN, when generalizing from natural images to other domains like artwork.},
urldate = {2024-03-02},
booktitle = {2016 {IEEE} {Conference} on {Computer} {Vision} and {Pattern} {Recognition} ({CVPR})},
author = {Redmon, Joseph and Divvala, Santosh and Girshick, Ross and Farhadi, Ali},
month = jun,
year = {2016},
publisher = {IEEE},
address = {Las Vegas, NV, USA},
isbn = {9781467388511},
pages = {779--788},
}
@misc{chen_big_2020,
title = {Big {Self}-{Supervised} {Models} are {Strong} {Semi}-{Supervised} {Learners}},
url = {http://arxiv.org/abs/2006.10029},
doi = {10.48550/arXiv.2006.10029},
abstract = {One paradigm for learning from few labeled examples while making best use of a large amount of unlabeled data is unsupervised pretraining followed by supervised fine-tuning. Although this paradigm uses unlabeled data in a task-agnostic way, in contrast to common approaches to semi-supervised learning for computer vision, we show that it is surprisingly effective for semi-supervised learning on ImageNet. A key ingredient of our approach is the use of big (deep and wide) networks during pretraining and fine-tuning. We find that, the fewer the labels, the more this approach (task-agnostic use of unlabeled data) benefits from a bigger network. After fine-tuning, the big network can be further improved and distilled into a much smaller one with little loss in classification accuracy by using the unlabeled examples for a second time, but in a task-specific way. The proposed semi-supervised learning algorithm can be summarized in three steps: unsupervised pretraining of a big ResNet model using SimCLRv2, supervised fine-tuning on a few labeled examples, and distillation with unlabeled examples for refining and transferring the task-specific knowledge. This procedure achieves 73.9\% ImageNet top-1 accuracy with just 1\% of the labels (\${\textbackslash}le\$13 labeled images per class) using ResNet-50, a \$10{\textbackslash}times\$ improvement in label efficiency over the previous state-of-the-art. With 10\% of labels, ResNet-50 trained with our method achieves 77.5\% top-1 accuracy, outperforming standard supervised training with all of the labels.},
urldate = {2024-03-02},
publisher = {arXiv},
author = {Chen, Ting and Kornblith, Simon and Swersky, Kevin and Norouzi, Mohammad and Hinton, Geoffrey},
month = oct,
year = {2020},
note = {arXiv:2006.10029 [cs, stat]},
keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Statistics - Machine Learning},
annote = {Comment: NeurIPS'2020. Code and pretrained models at https://github.com/google-research/simclr},
}
@article{acosta_multimodal_2022,
title = {Multimodal biomedical {AI}},
volume = {28},
issn = {1546-170X},
doi = {10.1038/s41591-022-01981-2},
abstract = {The increasing availability of biomedical data from large biobanks, electronic health records, medical imaging, wearable and ambient biosensors, and the lower cost of genome and microbiome sequencing have set the stage for the development of multimodal artificial intelligence solutions that capture the complexity of human health and disease. In this Review, we outline the key applications enabled, along with the technical and analytical challenges. We explore opportunities in personalized medicine, digital clinical trials, remote monitoring and care, pandemic surveillance, digital twin technology and virtual health assistants. Further, we survey the data, modeling and privacy challenges that must be overcome to realize the full potential of multimodal artificial intelligence in health.},
language = {eng},
number = {9},
journal = {Nature Medicine},
author = {Acosta, Julián N. and Falcone, Guido J. and Rajpurkar, Pranav and Topol, Eric J.},
month = sep,
year = {2022},
pmid = {36109635},
keywords = {Artificial Intelligence, Electronic Health Records, Humans, Pandemics, Privacy},
pages = {1773--1784},
}
@article{liu_artificial_2021,
title = {Artificial {Intelligence}-{Based} {Image} {Enhancement} in {PET} {Imaging}: {Noise} {Reduction} and {Resolution} {Enhancement}},
shorttitle = {Artificial {Intelligence}-{Based} {Image} {Enhancement} in {PET} {Imaging}},
volume = {16},
issn = {1556-8598},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1556859821000444},
doi = {10.1016/j.cpet.2021.06.005},
language = {en},
number = {4},
urldate = {2024-03-02},
journal = {PET Clinics},
author = {Liu, Juan and Malekzadeh, Masoud and Mirian, Niloufar and Song, Tzu-An and Liu, Chi and Dutta, Joyita},
month = oct,
year = {2021},
pages = {553--576},
}